Commit 52f37629fd3c7b24e1e6c125e665454cd7ac1acb

Authored by Minchan Kim
Committed by Linus Torvalds
1 parent f1cb08798e

THP: fix comment about memory barrier

Currently the memory barrier assumed in __do_huge_pmd_anonymous_page doesn't
work: lru_cache_add_lru uses a pagevec, so it can easily skip taking the
spinlock, the ordering the comment relies on is broken, and userspace could
see inconsistent data.
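
For context: lru_cache_add_lru normally just stashes the page in a per-cpu
pagevec and only takes zone->lru_lock when that pagevec is drained, so most
calls never acquire the lock at all.  A simplified sketch of the mm/swap.c
fast path, written from memory rather than quoted from the tree:

	void __lru_cache_add(struct page *page, enum lru_list lru)
	{
		struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];

		page_cache_get(page);
		if (!pagevec_add(pvec, page))
			/* pagevec now full: only this drain takes zone->lru_lock */
			__pagevec_lru_add(pvec, lru);
		put_cpu_var(lru_add_pvecs);
	}

So the "spinlock acts as a barrier" assumption only ever holds on the rare
drain, not on the common path.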

I was not the first person to point out the problem.  Mel and Peter pointed
it out a few months ago, and Peter further noted that even a
spin_lock/unlock pair cannot guarantee the ordering:

  http://marc.info/?t=134333512700004

	In particular:

        	*A = a;
        	LOCK
        	UNLOCK
        	*B = b;

	may occur as:

        	LOCK, STORE *B, STORE *A, UNLOCK

Finally, Hugh pointed out that we don't even need a memory barrier there,
because __SetPageUptodate has already provided one explicitly since Nick's
commit 0ed361dec369 ("mm: fix PageUptodate data race").

So this patch fixes the comment in the THP code and adds the same comment to
do_anonymous_page, too: everybody except Hugh had missed this, which is a
good indication that the ordering deserves a comment.

Signed-off-by: Minchan Kim <minchan@kernel.org>
Acked-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 2 changed files with 10 additions and 6 deletions

mm/huge_memory.c

1 /* 1 /*
2 * Copyright (C) 2009 Red Hat, Inc. 2 * Copyright (C) 2009 Red Hat, Inc.
3 * 3 *
4 * This work is licensed under the terms of the GNU GPL, version 2. See 4 * This work is licensed under the terms of the GNU GPL, version 2. See
5 * the COPYING file in the top-level directory. 5 * the COPYING file in the top-level directory.
6 */ 6 */
7 7
8 #include <linux/mm.h> 8 #include <linux/mm.h>
9 #include <linux/sched.h> 9 #include <linux/sched.h>
10 #include <linux/highmem.h> 10 #include <linux/highmem.h>
11 #include <linux/hugetlb.h> 11 #include <linux/hugetlb.h>
12 #include <linux/mmu_notifier.h> 12 #include <linux/mmu_notifier.h>
13 #include <linux/rmap.h> 13 #include <linux/rmap.h>
14 #include <linux/swap.h> 14 #include <linux/swap.h>
15 #include <linux/shrinker.h> 15 #include <linux/shrinker.h>
16 #include <linux/mm_inline.h> 16 #include <linux/mm_inline.h>
17 #include <linux/kthread.h> 17 #include <linux/kthread.h>
18 #include <linux/khugepaged.h> 18 #include <linux/khugepaged.h>
19 #include <linux/freezer.h> 19 #include <linux/freezer.h>
20 #include <linux/mman.h> 20 #include <linux/mman.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/migrate.h> 22 #include <linux/migrate.h>
23 #include <linux/hashtable.h> 23 #include <linux/hashtable.h>
24 24
25 #include <asm/tlb.h> 25 #include <asm/tlb.h>
26 #include <asm/pgalloc.h> 26 #include <asm/pgalloc.h>
27 #include "internal.h" 27 #include "internal.h"
28 28
29 /* 29 /*
30 * By default transparent hugepage support is enabled for all mappings 30 * By default transparent hugepage support is enabled for all mappings
31 * and khugepaged scans all mappings. Defrag is only invoked by 31 * and khugepaged scans all mappings. Defrag is only invoked by
32 * khugepaged hugepage allocations and by page faults inside 32 * khugepaged hugepage allocations and by page faults inside
33 * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived 33 * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
34 * allocations. 34 * allocations.
35 */ 35 */
36 unsigned long transparent_hugepage_flags __read_mostly = 36 unsigned long transparent_hugepage_flags __read_mostly =
37 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS 37 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
38 (1<<TRANSPARENT_HUGEPAGE_FLAG)| 38 (1<<TRANSPARENT_HUGEPAGE_FLAG)|
39 #endif 39 #endif
40 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE 40 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
41 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| 41 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
42 #endif 42 #endif
43 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| 43 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
44 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| 44 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
45 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); 45 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
46 46
47 /* default scan 8*512 pte (or vmas) every 30 second */ 47 /* default scan 8*512 pte (or vmas) every 30 second */
48 static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; 48 static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
49 static unsigned int khugepaged_pages_collapsed; 49 static unsigned int khugepaged_pages_collapsed;
50 static unsigned int khugepaged_full_scans; 50 static unsigned int khugepaged_full_scans;
51 static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; 51 static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
52 /* during fragmentation poll the hugepage allocator once every minute */ 52 /* during fragmentation poll the hugepage allocator once every minute */
53 static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; 53 static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
54 static struct task_struct *khugepaged_thread __read_mostly; 54 static struct task_struct *khugepaged_thread __read_mostly;
55 static DEFINE_MUTEX(khugepaged_mutex); 55 static DEFINE_MUTEX(khugepaged_mutex);
56 static DEFINE_SPINLOCK(khugepaged_mm_lock); 56 static DEFINE_SPINLOCK(khugepaged_mm_lock);
57 static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); 57 static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
58 /* 58 /*
59 * default collapse hugepages if there is at least one pte mapped like 59 * default collapse hugepages if there is at least one pte mapped like
60 * it would have happened if the vma was large enough during page 60 * it would have happened if the vma was large enough during page
61 * fault. 61 * fault.
62 */ 62 */
63 static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; 63 static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
64 64
65 static int khugepaged(void *none); 65 static int khugepaged(void *none);
66 static int khugepaged_slab_init(void); 66 static int khugepaged_slab_init(void);
67 67
68 #define MM_SLOTS_HASH_BITS 10 68 #define MM_SLOTS_HASH_BITS 10
69 static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); 69 static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
70 70
71 static struct kmem_cache *mm_slot_cache __read_mostly; 71 static struct kmem_cache *mm_slot_cache __read_mostly;
72 72
73 /** 73 /**
74 * struct mm_slot - hash lookup from mm to mm_slot 74 * struct mm_slot - hash lookup from mm to mm_slot
75 * @hash: hash collision list 75 * @hash: hash collision list
76 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head 76 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
77 * @mm: the mm that this information is valid for 77 * @mm: the mm that this information is valid for
78 */ 78 */
79 struct mm_slot { 79 struct mm_slot {
80 struct hlist_node hash; 80 struct hlist_node hash;
81 struct list_head mm_node; 81 struct list_head mm_node;
82 struct mm_struct *mm; 82 struct mm_struct *mm;
83 }; 83 };
84 84
85 /** 85 /**
86 * struct khugepaged_scan - cursor for scanning 86 * struct khugepaged_scan - cursor for scanning
87 * @mm_head: the head of the mm list to scan 87 * @mm_head: the head of the mm list to scan
88 * @mm_slot: the current mm_slot we are scanning 88 * @mm_slot: the current mm_slot we are scanning
89 * @address: the next address inside that to be scanned 89 * @address: the next address inside that to be scanned
90 * 90 *
91 * There is only the one khugepaged_scan instance of this cursor structure. 91 * There is only the one khugepaged_scan instance of this cursor structure.
92 */ 92 */
93 struct khugepaged_scan { 93 struct khugepaged_scan {
94 struct list_head mm_head; 94 struct list_head mm_head;
95 struct mm_slot *mm_slot; 95 struct mm_slot *mm_slot;
96 unsigned long address; 96 unsigned long address;
97 }; 97 };
98 static struct khugepaged_scan khugepaged_scan = { 98 static struct khugepaged_scan khugepaged_scan = {
99 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), 99 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
100 }; 100 };
101 101
102 102
103 static int set_recommended_min_free_kbytes(void) 103 static int set_recommended_min_free_kbytes(void)
104 { 104 {
105 struct zone *zone; 105 struct zone *zone;
106 int nr_zones = 0; 106 int nr_zones = 0;
107 unsigned long recommended_min; 107 unsigned long recommended_min;
108 108
109 if (!khugepaged_enabled()) 109 if (!khugepaged_enabled())
110 return 0; 110 return 0;
111 111
112 for_each_populated_zone(zone) 112 for_each_populated_zone(zone)
113 nr_zones++; 113 nr_zones++;
114 114
115 /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */ 115 /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
116 recommended_min = pageblock_nr_pages * nr_zones * 2; 116 recommended_min = pageblock_nr_pages * nr_zones * 2;
117 117
118 /* 118 /*
119 * Make sure that on average at least two pageblocks are almost free 119 * Make sure that on average at least two pageblocks are almost free
120 * of another type, one for a migratetype to fall back to and a 120 * of another type, one for a migratetype to fall back to and a
121 * second to avoid subsequent fallbacks of other types There are 3 121 * second to avoid subsequent fallbacks of other types There are 3
122 * MIGRATE_TYPES we care about. 122 * MIGRATE_TYPES we care about.
123 */ 123 */
124 recommended_min += pageblock_nr_pages * nr_zones * 124 recommended_min += pageblock_nr_pages * nr_zones *
125 MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; 125 MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
126 126
127 /* don't ever allow to reserve more than 5% of the lowmem */ 127 /* don't ever allow to reserve more than 5% of the lowmem */
128 recommended_min = min(recommended_min, 128 recommended_min = min(recommended_min,
129 (unsigned long) nr_free_buffer_pages() / 20); 129 (unsigned long) nr_free_buffer_pages() / 20);
130 recommended_min <<= (PAGE_SHIFT-10); 130 recommended_min <<= (PAGE_SHIFT-10);
131 131
132 if (recommended_min > min_free_kbytes) 132 if (recommended_min > min_free_kbytes)
133 min_free_kbytes = recommended_min; 133 min_free_kbytes = recommended_min;
134 setup_per_zone_wmarks(); 134 setup_per_zone_wmarks();
135 return 0; 135 return 0;
136 } 136 }
137 late_initcall(set_recommended_min_free_kbytes); 137 late_initcall(set_recommended_min_free_kbytes);
138 138
139 static int start_khugepaged(void) 139 static int start_khugepaged(void)
140 { 140 {
141 int err = 0; 141 int err = 0;
142 if (khugepaged_enabled()) { 142 if (khugepaged_enabled()) {
143 if (!khugepaged_thread) 143 if (!khugepaged_thread)
144 khugepaged_thread = kthread_run(khugepaged, NULL, 144 khugepaged_thread = kthread_run(khugepaged, NULL,
145 "khugepaged"); 145 "khugepaged");
146 if (unlikely(IS_ERR(khugepaged_thread))) { 146 if (unlikely(IS_ERR(khugepaged_thread))) {
147 printk(KERN_ERR 147 printk(KERN_ERR
148 "khugepaged: kthread_run(khugepaged) failed\n"); 148 "khugepaged: kthread_run(khugepaged) failed\n");
149 err = PTR_ERR(khugepaged_thread); 149 err = PTR_ERR(khugepaged_thread);
150 khugepaged_thread = NULL; 150 khugepaged_thread = NULL;
151 } 151 }
152 152
153 if (!list_empty(&khugepaged_scan.mm_head)) 153 if (!list_empty(&khugepaged_scan.mm_head))
154 wake_up_interruptible(&khugepaged_wait); 154 wake_up_interruptible(&khugepaged_wait);
155 155
156 set_recommended_min_free_kbytes(); 156 set_recommended_min_free_kbytes();
157 } else if (khugepaged_thread) { 157 } else if (khugepaged_thread) {
158 kthread_stop(khugepaged_thread); 158 kthread_stop(khugepaged_thread);
159 khugepaged_thread = NULL; 159 khugepaged_thread = NULL;
160 } 160 }
161 161
162 return err; 162 return err;
163 } 163 }
164 164
165 static atomic_t huge_zero_refcount; 165 static atomic_t huge_zero_refcount;
166 static unsigned long huge_zero_pfn __read_mostly; 166 static unsigned long huge_zero_pfn __read_mostly;
167 167
168 static inline bool is_huge_zero_pfn(unsigned long pfn) 168 static inline bool is_huge_zero_pfn(unsigned long pfn)
169 { 169 {
170 unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn); 170 unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
171 return zero_pfn && pfn == zero_pfn; 171 return zero_pfn && pfn == zero_pfn;
172 } 172 }
173 173
174 static inline bool is_huge_zero_pmd(pmd_t pmd) 174 static inline bool is_huge_zero_pmd(pmd_t pmd)
175 { 175 {
176 return is_huge_zero_pfn(pmd_pfn(pmd)); 176 return is_huge_zero_pfn(pmd_pfn(pmd));
177 } 177 }
178 178
179 static unsigned long get_huge_zero_page(void) 179 static unsigned long get_huge_zero_page(void)
180 { 180 {
181 struct page *zero_page; 181 struct page *zero_page;
182 retry: 182 retry:
183 if (likely(atomic_inc_not_zero(&huge_zero_refcount))) 183 if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
184 return ACCESS_ONCE(huge_zero_pfn); 184 return ACCESS_ONCE(huge_zero_pfn);
185 185
186 zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, 186 zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
187 HPAGE_PMD_ORDER); 187 HPAGE_PMD_ORDER);
188 if (!zero_page) { 188 if (!zero_page) {
189 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); 189 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
190 return 0; 190 return 0;
191 } 191 }
192 count_vm_event(THP_ZERO_PAGE_ALLOC); 192 count_vm_event(THP_ZERO_PAGE_ALLOC);
193 preempt_disable(); 193 preempt_disable();
194 if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) { 194 if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
195 preempt_enable(); 195 preempt_enable();
196 __free_page(zero_page); 196 __free_page(zero_page);
197 goto retry; 197 goto retry;
198 } 198 }
199 199
200 /* We take additional reference here. It will be put back by shrinker */ 200 /* We take additional reference here. It will be put back by shrinker */
201 atomic_set(&huge_zero_refcount, 2); 201 atomic_set(&huge_zero_refcount, 2);
202 preempt_enable(); 202 preempt_enable();
203 return ACCESS_ONCE(huge_zero_pfn); 203 return ACCESS_ONCE(huge_zero_pfn);
204 } 204 }
205 205
206 static void put_huge_zero_page(void) 206 static void put_huge_zero_page(void)
207 { 207 {
208 /* 208 /*
209 * Counter should never go to zero here. Only shrinker can put 209 * Counter should never go to zero here. Only shrinker can put
210 * last reference. 210 * last reference.
211 */ 211 */
212 BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); 212 BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
213 } 213 }
214 214
215 static int shrink_huge_zero_page(struct shrinker *shrink, 215 static int shrink_huge_zero_page(struct shrinker *shrink,
216 struct shrink_control *sc) 216 struct shrink_control *sc)
217 { 217 {
218 if (!sc->nr_to_scan) 218 if (!sc->nr_to_scan)
219 /* we can free zero page only if last reference remains */ 219 /* we can free zero page only if last reference remains */
220 return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; 220 return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
221 221
222 if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { 222 if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
223 unsigned long zero_pfn = xchg(&huge_zero_pfn, 0); 223 unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
224 BUG_ON(zero_pfn == 0); 224 BUG_ON(zero_pfn == 0);
225 __free_page(__pfn_to_page(zero_pfn)); 225 __free_page(__pfn_to_page(zero_pfn));
226 } 226 }
227 227
228 return 0; 228 return 0;
229 } 229 }
230 230
231 static struct shrinker huge_zero_page_shrinker = { 231 static struct shrinker huge_zero_page_shrinker = {
232 .shrink = shrink_huge_zero_page, 232 .shrink = shrink_huge_zero_page,
233 .seeks = DEFAULT_SEEKS, 233 .seeks = DEFAULT_SEEKS,
234 }; 234 };
235 235
236 #ifdef CONFIG_SYSFS 236 #ifdef CONFIG_SYSFS
237 237
238 static ssize_t double_flag_show(struct kobject *kobj, 238 static ssize_t double_flag_show(struct kobject *kobj,
239 struct kobj_attribute *attr, char *buf, 239 struct kobj_attribute *attr, char *buf,
240 enum transparent_hugepage_flag enabled, 240 enum transparent_hugepage_flag enabled,
241 enum transparent_hugepage_flag req_madv) 241 enum transparent_hugepage_flag req_madv)
242 { 242 {
243 if (test_bit(enabled, &transparent_hugepage_flags)) { 243 if (test_bit(enabled, &transparent_hugepage_flags)) {
244 VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags)); 244 VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
245 return sprintf(buf, "[always] madvise never\n"); 245 return sprintf(buf, "[always] madvise never\n");
246 } else if (test_bit(req_madv, &transparent_hugepage_flags)) 246 } else if (test_bit(req_madv, &transparent_hugepage_flags))
247 return sprintf(buf, "always [madvise] never\n"); 247 return sprintf(buf, "always [madvise] never\n");
248 else 248 else
249 return sprintf(buf, "always madvise [never]\n"); 249 return sprintf(buf, "always madvise [never]\n");
250 } 250 }
251 static ssize_t double_flag_store(struct kobject *kobj, 251 static ssize_t double_flag_store(struct kobject *kobj,
252 struct kobj_attribute *attr, 252 struct kobj_attribute *attr,
253 const char *buf, size_t count, 253 const char *buf, size_t count,
254 enum transparent_hugepage_flag enabled, 254 enum transparent_hugepage_flag enabled,
255 enum transparent_hugepage_flag req_madv) 255 enum transparent_hugepage_flag req_madv)
256 { 256 {
257 if (!memcmp("always", buf, 257 if (!memcmp("always", buf,
258 min(sizeof("always")-1, count))) { 258 min(sizeof("always")-1, count))) {
259 set_bit(enabled, &transparent_hugepage_flags); 259 set_bit(enabled, &transparent_hugepage_flags);
260 clear_bit(req_madv, &transparent_hugepage_flags); 260 clear_bit(req_madv, &transparent_hugepage_flags);
261 } else if (!memcmp("madvise", buf, 261 } else if (!memcmp("madvise", buf,
262 min(sizeof("madvise")-1, count))) { 262 min(sizeof("madvise")-1, count))) {
263 clear_bit(enabled, &transparent_hugepage_flags); 263 clear_bit(enabled, &transparent_hugepage_flags);
264 set_bit(req_madv, &transparent_hugepage_flags); 264 set_bit(req_madv, &transparent_hugepage_flags);
265 } else if (!memcmp("never", buf, 265 } else if (!memcmp("never", buf,
266 min(sizeof("never")-1, count))) { 266 min(sizeof("never")-1, count))) {
267 clear_bit(enabled, &transparent_hugepage_flags); 267 clear_bit(enabled, &transparent_hugepage_flags);
268 clear_bit(req_madv, &transparent_hugepage_flags); 268 clear_bit(req_madv, &transparent_hugepage_flags);
269 } else 269 } else
270 return -EINVAL; 270 return -EINVAL;
271 271
272 return count; 272 return count;
273 } 273 }
274 274
275 static ssize_t enabled_show(struct kobject *kobj, 275 static ssize_t enabled_show(struct kobject *kobj,
276 struct kobj_attribute *attr, char *buf) 276 struct kobj_attribute *attr, char *buf)
277 { 277 {
278 return double_flag_show(kobj, attr, buf, 278 return double_flag_show(kobj, attr, buf,
279 TRANSPARENT_HUGEPAGE_FLAG, 279 TRANSPARENT_HUGEPAGE_FLAG,
280 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); 280 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
281 } 281 }
282 static ssize_t enabled_store(struct kobject *kobj, 282 static ssize_t enabled_store(struct kobject *kobj,
283 struct kobj_attribute *attr, 283 struct kobj_attribute *attr,
284 const char *buf, size_t count) 284 const char *buf, size_t count)
285 { 285 {
286 ssize_t ret; 286 ssize_t ret;
287 287
288 ret = double_flag_store(kobj, attr, buf, count, 288 ret = double_flag_store(kobj, attr, buf, count,
289 TRANSPARENT_HUGEPAGE_FLAG, 289 TRANSPARENT_HUGEPAGE_FLAG,
290 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); 290 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
291 291
292 if (ret > 0) { 292 if (ret > 0) {
293 int err; 293 int err;
294 294
295 mutex_lock(&khugepaged_mutex); 295 mutex_lock(&khugepaged_mutex);
296 err = start_khugepaged(); 296 err = start_khugepaged();
297 mutex_unlock(&khugepaged_mutex); 297 mutex_unlock(&khugepaged_mutex);
298 298
299 if (err) 299 if (err)
300 ret = err; 300 ret = err;
301 } 301 }
302 302
303 return ret; 303 return ret;
304 } 304 }
305 static struct kobj_attribute enabled_attr = 305 static struct kobj_attribute enabled_attr =
306 __ATTR(enabled, 0644, enabled_show, enabled_store); 306 __ATTR(enabled, 0644, enabled_show, enabled_store);
307 307
308 static ssize_t single_flag_show(struct kobject *kobj, 308 static ssize_t single_flag_show(struct kobject *kobj,
309 struct kobj_attribute *attr, char *buf, 309 struct kobj_attribute *attr, char *buf,
310 enum transparent_hugepage_flag flag) 310 enum transparent_hugepage_flag flag)
311 { 311 {
312 return sprintf(buf, "%d\n", 312 return sprintf(buf, "%d\n",
313 !!test_bit(flag, &transparent_hugepage_flags)); 313 !!test_bit(flag, &transparent_hugepage_flags));
314 } 314 }
315 315
316 static ssize_t single_flag_store(struct kobject *kobj, 316 static ssize_t single_flag_store(struct kobject *kobj,
317 struct kobj_attribute *attr, 317 struct kobj_attribute *attr,
318 const char *buf, size_t count, 318 const char *buf, size_t count,
319 enum transparent_hugepage_flag flag) 319 enum transparent_hugepage_flag flag)
320 { 320 {
321 unsigned long value; 321 unsigned long value;
322 int ret; 322 int ret;
323 323
324 ret = kstrtoul(buf, 10, &value); 324 ret = kstrtoul(buf, 10, &value);
325 if (ret < 0) 325 if (ret < 0)
326 return ret; 326 return ret;
327 if (value > 1) 327 if (value > 1)
328 return -EINVAL; 328 return -EINVAL;
329 329
330 if (value) 330 if (value)
331 set_bit(flag, &transparent_hugepage_flags); 331 set_bit(flag, &transparent_hugepage_flags);
332 else 332 else
333 clear_bit(flag, &transparent_hugepage_flags); 333 clear_bit(flag, &transparent_hugepage_flags);
334 334
335 return count; 335 return count;
336 } 336 }
337 337
338 /* 338 /*
339 * Currently defrag only disables __GFP_NOWAIT for allocation. A blind 339 * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
340 * __GFP_REPEAT is too aggressive, it's never worth swapping tons of 340 * __GFP_REPEAT is too aggressive, it's never worth swapping tons of
341 * memory just to allocate one more hugepage. 341 * memory just to allocate one more hugepage.
342 */ 342 */
343 static ssize_t defrag_show(struct kobject *kobj, 343 static ssize_t defrag_show(struct kobject *kobj,
344 struct kobj_attribute *attr, char *buf) 344 struct kobj_attribute *attr, char *buf)
345 { 345 {
346 return double_flag_show(kobj, attr, buf, 346 return double_flag_show(kobj, attr, buf,
347 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, 347 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
348 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); 348 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
349 } 349 }
350 static ssize_t defrag_store(struct kobject *kobj, 350 static ssize_t defrag_store(struct kobject *kobj,
351 struct kobj_attribute *attr, 351 struct kobj_attribute *attr,
352 const char *buf, size_t count) 352 const char *buf, size_t count)
353 { 353 {
354 return double_flag_store(kobj, attr, buf, count, 354 return double_flag_store(kobj, attr, buf, count,
355 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, 355 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
356 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); 356 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
357 } 357 }
358 static struct kobj_attribute defrag_attr = 358 static struct kobj_attribute defrag_attr =
359 __ATTR(defrag, 0644, defrag_show, defrag_store); 359 __ATTR(defrag, 0644, defrag_show, defrag_store);
360 360
361 static ssize_t use_zero_page_show(struct kobject *kobj, 361 static ssize_t use_zero_page_show(struct kobject *kobj,
362 struct kobj_attribute *attr, char *buf) 362 struct kobj_attribute *attr, char *buf)
363 { 363 {
364 return single_flag_show(kobj, attr, buf, 364 return single_flag_show(kobj, attr, buf,
365 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); 365 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
366 } 366 }
367 static ssize_t use_zero_page_store(struct kobject *kobj, 367 static ssize_t use_zero_page_store(struct kobject *kobj,
368 struct kobj_attribute *attr, const char *buf, size_t count) 368 struct kobj_attribute *attr, const char *buf, size_t count)
369 { 369 {
370 return single_flag_store(kobj, attr, buf, count, 370 return single_flag_store(kobj, attr, buf, count,
371 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); 371 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
372 } 372 }
373 static struct kobj_attribute use_zero_page_attr = 373 static struct kobj_attribute use_zero_page_attr =
374 __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store); 374 __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
375 #ifdef CONFIG_DEBUG_VM 375 #ifdef CONFIG_DEBUG_VM
376 static ssize_t debug_cow_show(struct kobject *kobj, 376 static ssize_t debug_cow_show(struct kobject *kobj,
377 struct kobj_attribute *attr, char *buf) 377 struct kobj_attribute *attr, char *buf)
378 { 378 {
379 return single_flag_show(kobj, attr, buf, 379 return single_flag_show(kobj, attr, buf,
380 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); 380 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
381 } 381 }
382 static ssize_t debug_cow_store(struct kobject *kobj, 382 static ssize_t debug_cow_store(struct kobject *kobj,
383 struct kobj_attribute *attr, 383 struct kobj_attribute *attr,
384 const char *buf, size_t count) 384 const char *buf, size_t count)
385 { 385 {
386 return single_flag_store(kobj, attr, buf, count, 386 return single_flag_store(kobj, attr, buf, count,
387 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); 387 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
388 } 388 }
389 static struct kobj_attribute debug_cow_attr = 389 static struct kobj_attribute debug_cow_attr =
390 __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store); 390 __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
391 #endif /* CONFIG_DEBUG_VM */ 391 #endif /* CONFIG_DEBUG_VM */
392 392
393 static struct attribute *hugepage_attr[] = { 393 static struct attribute *hugepage_attr[] = {
394 &enabled_attr.attr, 394 &enabled_attr.attr,
395 &defrag_attr.attr, 395 &defrag_attr.attr,
396 &use_zero_page_attr.attr, 396 &use_zero_page_attr.attr,
397 #ifdef CONFIG_DEBUG_VM 397 #ifdef CONFIG_DEBUG_VM
398 &debug_cow_attr.attr, 398 &debug_cow_attr.attr,
399 #endif 399 #endif
400 NULL, 400 NULL,
401 }; 401 };
402 402
403 static struct attribute_group hugepage_attr_group = { 403 static struct attribute_group hugepage_attr_group = {
404 .attrs = hugepage_attr, 404 .attrs = hugepage_attr,
405 }; 405 };
406 406
407 static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, 407 static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
408 struct kobj_attribute *attr, 408 struct kobj_attribute *attr,
409 char *buf) 409 char *buf)
410 { 410 {
411 return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); 411 return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
412 } 412 }
413 413
414 static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, 414 static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
415 struct kobj_attribute *attr, 415 struct kobj_attribute *attr,
416 const char *buf, size_t count) 416 const char *buf, size_t count)
417 { 417 {
418 unsigned long msecs; 418 unsigned long msecs;
419 int err; 419 int err;
420 420
421 err = strict_strtoul(buf, 10, &msecs); 421 err = strict_strtoul(buf, 10, &msecs);
422 if (err || msecs > UINT_MAX) 422 if (err || msecs > UINT_MAX)
423 return -EINVAL; 423 return -EINVAL;
424 424
425 khugepaged_scan_sleep_millisecs = msecs; 425 khugepaged_scan_sleep_millisecs = msecs;
426 wake_up_interruptible(&khugepaged_wait); 426 wake_up_interruptible(&khugepaged_wait);
427 427
428 return count; 428 return count;
429 } 429 }
430 static struct kobj_attribute scan_sleep_millisecs_attr = 430 static struct kobj_attribute scan_sleep_millisecs_attr =
431 __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, 431 __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
432 scan_sleep_millisecs_store); 432 scan_sleep_millisecs_store);
433 433
434 static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, 434 static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
435 struct kobj_attribute *attr, 435 struct kobj_attribute *attr,
436 char *buf) 436 char *buf)
437 { 437 {
438 return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); 438 return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
439 } 439 }
440 440
441 static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, 441 static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
442 struct kobj_attribute *attr, 442 struct kobj_attribute *attr,
443 const char *buf, size_t count) 443 const char *buf, size_t count)
444 { 444 {
445 unsigned long msecs; 445 unsigned long msecs;
446 int err; 446 int err;
447 447
448 err = strict_strtoul(buf, 10, &msecs); 448 err = strict_strtoul(buf, 10, &msecs);
449 if (err || msecs > UINT_MAX) 449 if (err || msecs > UINT_MAX)
450 return -EINVAL; 450 return -EINVAL;
451 451
452 khugepaged_alloc_sleep_millisecs = msecs; 452 khugepaged_alloc_sleep_millisecs = msecs;
453 wake_up_interruptible(&khugepaged_wait); 453 wake_up_interruptible(&khugepaged_wait);
454 454
455 return count; 455 return count;
456 } 456 }
457 static struct kobj_attribute alloc_sleep_millisecs_attr = 457 static struct kobj_attribute alloc_sleep_millisecs_attr =
458 __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, 458 __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
459 alloc_sleep_millisecs_store); 459 alloc_sleep_millisecs_store);
460 460
461 static ssize_t pages_to_scan_show(struct kobject *kobj, 461 static ssize_t pages_to_scan_show(struct kobject *kobj,
462 struct kobj_attribute *attr, 462 struct kobj_attribute *attr,
463 char *buf) 463 char *buf)
464 { 464 {
465 return sprintf(buf, "%u\n", khugepaged_pages_to_scan); 465 return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
466 } 466 }
467 static ssize_t pages_to_scan_store(struct kobject *kobj, 467 static ssize_t pages_to_scan_store(struct kobject *kobj,
468 struct kobj_attribute *attr, 468 struct kobj_attribute *attr,
469 const char *buf, size_t count) 469 const char *buf, size_t count)
470 { 470 {
471 int err; 471 int err;
472 unsigned long pages; 472 unsigned long pages;
473 473
474 err = strict_strtoul(buf, 10, &pages); 474 err = strict_strtoul(buf, 10, &pages);
475 if (err || !pages || pages > UINT_MAX) 475 if (err || !pages || pages > UINT_MAX)
476 return -EINVAL; 476 return -EINVAL;
477 477
478 khugepaged_pages_to_scan = pages; 478 khugepaged_pages_to_scan = pages;
479 479
480 return count; 480 return count;
481 } 481 }
482 static struct kobj_attribute pages_to_scan_attr = 482 static struct kobj_attribute pages_to_scan_attr =
483 __ATTR(pages_to_scan, 0644, pages_to_scan_show, 483 __ATTR(pages_to_scan, 0644, pages_to_scan_show,
484 pages_to_scan_store); 484 pages_to_scan_store);
485 485
486 static ssize_t pages_collapsed_show(struct kobject *kobj, 486 static ssize_t pages_collapsed_show(struct kobject *kobj,
487 struct kobj_attribute *attr, 487 struct kobj_attribute *attr,
488 char *buf) 488 char *buf)
489 { 489 {
490 return sprintf(buf, "%u\n", khugepaged_pages_collapsed); 490 return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
491 } 491 }
492 static struct kobj_attribute pages_collapsed_attr = 492 static struct kobj_attribute pages_collapsed_attr =
493 __ATTR_RO(pages_collapsed); 493 __ATTR_RO(pages_collapsed);
494 494
495 static ssize_t full_scans_show(struct kobject *kobj, 495 static ssize_t full_scans_show(struct kobject *kobj,
496 struct kobj_attribute *attr, 496 struct kobj_attribute *attr,
497 char *buf) 497 char *buf)
498 { 498 {
499 return sprintf(buf, "%u\n", khugepaged_full_scans); 499 return sprintf(buf, "%u\n", khugepaged_full_scans);
500 } 500 }
501 static struct kobj_attribute full_scans_attr = 501 static struct kobj_attribute full_scans_attr =
502 __ATTR_RO(full_scans); 502 __ATTR_RO(full_scans);
503 503
504 static ssize_t khugepaged_defrag_show(struct kobject *kobj, 504 static ssize_t khugepaged_defrag_show(struct kobject *kobj,
505 struct kobj_attribute *attr, char *buf) 505 struct kobj_attribute *attr, char *buf)
506 { 506 {
507 return single_flag_show(kobj, attr, buf, 507 return single_flag_show(kobj, attr, buf,
508 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); 508 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
509 } 509 }
510 static ssize_t khugepaged_defrag_store(struct kobject *kobj, 510 static ssize_t khugepaged_defrag_store(struct kobject *kobj,
511 struct kobj_attribute *attr, 511 struct kobj_attribute *attr,
512 const char *buf, size_t count) 512 const char *buf, size_t count)
513 { 513 {
514 return single_flag_store(kobj, attr, buf, count, 514 return single_flag_store(kobj, attr, buf, count,
515 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); 515 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
516 } 516 }
517 static struct kobj_attribute khugepaged_defrag_attr = 517 static struct kobj_attribute khugepaged_defrag_attr =
518 __ATTR(defrag, 0644, khugepaged_defrag_show, 518 __ATTR(defrag, 0644, khugepaged_defrag_show,
519 khugepaged_defrag_store); 519 khugepaged_defrag_store);
520 520
521 /* 521 /*
522 * max_ptes_none controls if khugepaged should collapse hugepages over 522 * max_ptes_none controls if khugepaged should collapse hugepages over
523 * any unmapped ptes in turn potentially increasing the memory 523 * any unmapped ptes in turn potentially increasing the memory
524 * footprint of the vmas. When max_ptes_none is 0 khugepaged will not 524 * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
525 * reduce the available free memory in the system as it 525 * reduce the available free memory in the system as it
526 * runs. Increasing max_ptes_none will instead potentially reduce the 526 * runs. Increasing max_ptes_none will instead potentially reduce the
527 * free memory in the system during the khugepaged scan. 527 * free memory in the system during the khugepaged scan.
528 */ 528 */
529 static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, 529 static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
530 struct kobj_attribute *attr, 530 struct kobj_attribute *attr,
531 char *buf) 531 char *buf)
532 { 532 {
533 return sprintf(buf, "%u\n", khugepaged_max_ptes_none); 533 return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
534 } 534 }
535 static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, 535 static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
536 struct kobj_attribute *attr, 536 struct kobj_attribute *attr,
537 const char *buf, size_t count) 537 const char *buf, size_t count)
538 { 538 {
539 int err; 539 int err;
540 unsigned long max_ptes_none; 540 unsigned long max_ptes_none;
541 541
542 err = strict_strtoul(buf, 10, &max_ptes_none); 542 err = strict_strtoul(buf, 10, &max_ptes_none);
543 if (err || max_ptes_none > HPAGE_PMD_NR-1) 543 if (err || max_ptes_none > HPAGE_PMD_NR-1)
544 return -EINVAL; 544 return -EINVAL;
545 545
546 khugepaged_max_ptes_none = max_ptes_none; 546 khugepaged_max_ptes_none = max_ptes_none;
547 547
548 return count; 548 return count;
549 } 549 }
550 static struct kobj_attribute khugepaged_max_ptes_none_attr = 550 static struct kobj_attribute khugepaged_max_ptes_none_attr =
551 __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, 551 __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
552 khugepaged_max_ptes_none_store); 552 khugepaged_max_ptes_none_store);
553 553
554 static struct attribute *khugepaged_attr[] = { 554 static struct attribute *khugepaged_attr[] = {
555 &khugepaged_defrag_attr.attr, 555 &khugepaged_defrag_attr.attr,
556 &khugepaged_max_ptes_none_attr.attr, 556 &khugepaged_max_ptes_none_attr.attr,
557 &pages_to_scan_attr.attr, 557 &pages_to_scan_attr.attr,
558 &pages_collapsed_attr.attr, 558 &pages_collapsed_attr.attr,
559 &full_scans_attr.attr, 559 &full_scans_attr.attr,
560 &scan_sleep_millisecs_attr.attr, 560 &scan_sleep_millisecs_attr.attr,
561 &alloc_sleep_millisecs_attr.attr, 561 &alloc_sleep_millisecs_attr.attr,
562 NULL, 562 NULL,
563 }; 563 };
564 564
565 static struct attribute_group khugepaged_attr_group = { 565 static struct attribute_group khugepaged_attr_group = {
566 .attrs = khugepaged_attr, 566 .attrs = khugepaged_attr,
567 .name = "khugepaged", 567 .name = "khugepaged",
568 }; 568 };
569 569
570 static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) 570 static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
571 { 571 {
572 int err; 572 int err;
573 573
574 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); 574 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
575 if (unlikely(!*hugepage_kobj)) { 575 if (unlikely(!*hugepage_kobj)) {
576 printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n"); 576 printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n");
577 return -ENOMEM; 577 return -ENOMEM;
578 } 578 }
579 579
580 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); 580 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
581 if (err) { 581 if (err) {
582 printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); 582 printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n");
583 goto delete_obj; 583 goto delete_obj;
584 } 584 }
585 585
586 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); 586 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
587 if (err) { 587 if (err) {
588 printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); 588 printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n");
589 goto remove_hp_group; 589 goto remove_hp_group;
590 } 590 }
591 591
592 return 0; 592 return 0;
593 593
594 remove_hp_group: 594 remove_hp_group:
595 sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group); 595 sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
596 delete_obj: 596 delete_obj:
597 kobject_put(*hugepage_kobj); 597 kobject_put(*hugepage_kobj);
598 return err; 598 return err;
599 } 599 }
600 600
601 static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj) 601 static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
602 { 602 {
603 sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group); 603 sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
604 sysfs_remove_group(hugepage_kobj, &hugepage_attr_group); 604 sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
605 kobject_put(hugepage_kobj); 605 kobject_put(hugepage_kobj);
606 } 606 }
607 #else 607 #else
608 static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj) 608 static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
609 { 609 {
610 return 0; 610 return 0;
611 } 611 }
612 612
613 static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) 613 static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
614 { 614 {
615 } 615 }
616 #endif /* CONFIG_SYSFS */ 616 #endif /* CONFIG_SYSFS */
617 617
618 static int __init hugepage_init(void) 618 static int __init hugepage_init(void)
619 { 619 {
620 int err; 620 int err;
621 struct kobject *hugepage_kobj; 621 struct kobject *hugepage_kobj;
622 622
623 if (!has_transparent_hugepage()) { 623 if (!has_transparent_hugepage()) {
624 transparent_hugepage_flags = 0; 624 transparent_hugepage_flags = 0;
625 return -EINVAL; 625 return -EINVAL;
626 } 626 }
627 627
628 err = hugepage_init_sysfs(&hugepage_kobj); 628 err = hugepage_init_sysfs(&hugepage_kobj);
629 if (err) 629 if (err)
630 return err; 630 return err;
631 631
632 err = khugepaged_slab_init(); 632 err = khugepaged_slab_init();
633 if (err) 633 if (err)
634 goto out; 634 goto out;
635 635
636 register_shrinker(&huge_zero_page_shrinker); 636 register_shrinker(&huge_zero_page_shrinker);
637 637
638 /* 638 /*
639 * By default disable transparent hugepages on smaller systems, 639 * By default disable transparent hugepages on smaller systems,
640 * where the extra memory used could hurt more than TLB overhead 640 * where the extra memory used could hurt more than TLB overhead
641 * is likely to save. The admin can still enable it through /sys. 641 * is likely to save. The admin can still enable it through /sys.
642 */ 642 */
643 if (totalram_pages < (512 << (20 - PAGE_SHIFT))) 643 if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
644 transparent_hugepage_flags = 0; 644 transparent_hugepage_flags = 0;
645 645
646 start_khugepaged(); 646 start_khugepaged();
647 647
648 return 0; 648 return 0;
649 out: 649 out:
650 hugepage_exit_sysfs(hugepage_kobj); 650 hugepage_exit_sysfs(hugepage_kobj);
651 return err; 651 return err;
652 } 652 }
653 module_init(hugepage_init) 653 module_init(hugepage_init)
654 654
655 static int __init setup_transparent_hugepage(char *str) 655 static int __init setup_transparent_hugepage(char *str)
656 { 656 {
657 int ret = 0; 657 int ret = 0;
658 if (!str) 658 if (!str)
659 goto out; 659 goto out;
660 if (!strcmp(str, "always")) { 660 if (!strcmp(str, "always")) {
661 set_bit(TRANSPARENT_HUGEPAGE_FLAG, 661 set_bit(TRANSPARENT_HUGEPAGE_FLAG,
662 &transparent_hugepage_flags); 662 &transparent_hugepage_flags);
663 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 663 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
664 &transparent_hugepage_flags); 664 &transparent_hugepage_flags);
665 ret = 1; 665 ret = 1;
666 } else if (!strcmp(str, "madvise")) { 666 } else if (!strcmp(str, "madvise")) {
667 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, 667 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
668 &transparent_hugepage_flags); 668 &transparent_hugepage_flags);
669 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 669 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
670 &transparent_hugepage_flags); 670 &transparent_hugepage_flags);
671 ret = 1; 671 ret = 1;
672 } else if (!strcmp(str, "never")) { 672 } else if (!strcmp(str, "never")) {
673 clear_bit(TRANSPARENT_HUGEPAGE_FLAG, 673 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
674 &transparent_hugepage_flags); 674 &transparent_hugepage_flags);
675 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 675 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
676 &transparent_hugepage_flags); 676 &transparent_hugepage_flags);
677 ret = 1; 677 ret = 1;
678 } 678 }
679 out: 679 out:
680 if (!ret) 680 if (!ret)
681 printk(KERN_WARNING 681 printk(KERN_WARNING
682 "transparent_hugepage= cannot parse, ignored\n"); 682 "transparent_hugepage= cannot parse, ignored\n");
683 return ret; 683 return ret;
684 } 684 }
685 __setup("transparent_hugepage=", setup_transparent_hugepage); 685 __setup("transparent_hugepage=", setup_transparent_hugepage);
686 686
687 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) 687 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
688 { 688 {
689 if (likely(vma->vm_flags & VM_WRITE)) 689 if (likely(vma->vm_flags & VM_WRITE))
690 pmd = pmd_mkwrite(pmd); 690 pmd = pmd_mkwrite(pmd);
691 return pmd; 691 return pmd;
692 } 692 }
693 693
694 static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma) 694 static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma)
695 { 695 {
696 pmd_t entry; 696 pmd_t entry;
697 entry = mk_pmd(page, vma->vm_page_prot); 697 entry = mk_pmd(page, vma->vm_page_prot);
698 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 698 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
699 entry = pmd_mkhuge(entry); 699 entry = pmd_mkhuge(entry);
700 return entry; 700 return entry;
701 } 701 }
702 702
703 static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, 703 static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
704 struct vm_area_struct *vma, 704 struct vm_area_struct *vma,
705 unsigned long haddr, pmd_t *pmd, 705 unsigned long haddr, pmd_t *pmd,
706 struct page *page) 706 struct page *page)
707 { 707 {
708 pgtable_t pgtable; 708 pgtable_t pgtable;
709 709
710 VM_BUG_ON(!PageCompound(page)); 710 VM_BUG_ON(!PageCompound(page));
711 pgtable = pte_alloc_one(mm, haddr); 711 pgtable = pte_alloc_one(mm, haddr);
712 if (unlikely(!pgtable)) 712 if (unlikely(!pgtable))
713 return VM_FAULT_OOM; 713 return VM_FAULT_OOM;
714 714
715 clear_huge_page(page, haddr, HPAGE_PMD_NR); 715 clear_huge_page(page, haddr, HPAGE_PMD_NR);
716 /*
717 * The memory barrier inside __SetPageUptodate makes sure that
718 * clear_huge_page writes become visible before the set_pmd_at()
719 * write.
720 */
716 __SetPageUptodate(page); 721 __SetPageUptodate(page);
717 722
718 spin_lock(&mm->page_table_lock); 723 spin_lock(&mm->page_table_lock);
719 if (unlikely(!pmd_none(*pmd))) { 724 if (unlikely(!pmd_none(*pmd))) {
720 spin_unlock(&mm->page_table_lock); 725 spin_unlock(&mm->page_table_lock);
721 mem_cgroup_uncharge_page(page); 726 mem_cgroup_uncharge_page(page);
722 put_page(page); 727 put_page(page);
723 pte_free(mm, pgtable); 728 pte_free(mm, pgtable);
724 } else { 729 } else {
725 pmd_t entry; 730 pmd_t entry;
726 entry = mk_huge_pmd(page, vma); 731 entry = mk_huge_pmd(page, vma);
727 /*
728 * The spinlocking to take the lru_lock inside
729 * page_add_new_anon_rmap() acts as a full memory
730 * barrier to be sure clear_huge_page writes become
731 * visible after the set_pmd_at() write.
732 */
733 page_add_new_anon_rmap(page, vma, haddr); 732 page_add_new_anon_rmap(page, vma, haddr);
734 set_pmd_at(mm, haddr, pmd, entry); 733 set_pmd_at(mm, haddr, pmd, entry);
735 pgtable_trans_huge_deposit(mm, pgtable); 734 pgtable_trans_huge_deposit(mm, pgtable);
736 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); 735 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
737 mm->nr_ptes++; 736 mm->nr_ptes++;
738 spin_unlock(&mm->page_table_lock); 737 spin_unlock(&mm->page_table_lock);
739 } 738 }
740 739
741 return 0; 740 return 0;
742 } 741 }
743 742
744 static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) 743 static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
745 { 744 {
746 return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; 745 return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
747 } 746 }
748 747
749 static inline struct page *alloc_hugepage_vma(int defrag, 748 static inline struct page *alloc_hugepage_vma(int defrag,
750 struct vm_area_struct *vma, 749 struct vm_area_struct *vma,
751 unsigned long haddr, int nd, 750 unsigned long haddr, int nd,
752 gfp_t extra_gfp) 751 gfp_t extra_gfp)
753 { 752 {
754 return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp), 753 return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
755 HPAGE_PMD_ORDER, vma, haddr, nd); 754 HPAGE_PMD_ORDER, vma, haddr, nd);
756 } 755 }
757 756
758 #ifndef CONFIG_NUMA 757 #ifndef CONFIG_NUMA
759 static inline struct page *alloc_hugepage(int defrag) 758 static inline struct page *alloc_hugepage(int defrag)
760 { 759 {
761 return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), 760 return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
762 HPAGE_PMD_ORDER); 761 HPAGE_PMD_ORDER);
763 } 762 }
764 #endif 763 #endif
765 764
766 static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, 765 static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
767 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, 766 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
768 unsigned long zero_pfn) 767 unsigned long zero_pfn)
769 { 768 {
770 pmd_t entry; 769 pmd_t entry;
771 if (!pmd_none(*pmd)) 770 if (!pmd_none(*pmd))
772 return false; 771 return false;
773 entry = pfn_pmd(zero_pfn, vma->vm_page_prot); 772 entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
774 entry = pmd_wrprotect(entry); 773 entry = pmd_wrprotect(entry);
775 entry = pmd_mkhuge(entry); 774 entry = pmd_mkhuge(entry);
776 set_pmd_at(mm, haddr, pmd, entry); 775 set_pmd_at(mm, haddr, pmd, entry);
777 pgtable_trans_huge_deposit(mm, pgtable); 776 pgtable_trans_huge_deposit(mm, pgtable);
778 mm->nr_ptes++; 777 mm->nr_ptes++;
779 return true; 778 return true;
780 } 779 }
781 780
782 int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 781 int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
783 unsigned long address, pmd_t *pmd, 782 unsigned long address, pmd_t *pmd,
784 unsigned int flags) 783 unsigned int flags)
785 { 784 {
786 struct page *page; 785 struct page *page;
787 unsigned long haddr = address & HPAGE_PMD_MASK; 786 unsigned long haddr = address & HPAGE_PMD_MASK;
788 pte_t *pte; 787 pte_t *pte;
789 788
790 if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { 789 if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
791 if (unlikely(anon_vma_prepare(vma))) 790 if (unlikely(anon_vma_prepare(vma)))
792 return VM_FAULT_OOM; 791 return VM_FAULT_OOM;
793 if (unlikely(khugepaged_enter(vma))) 792 if (unlikely(khugepaged_enter(vma)))
794 return VM_FAULT_OOM; 793 return VM_FAULT_OOM;
795 if (!(flags & FAULT_FLAG_WRITE) && 794 if (!(flags & FAULT_FLAG_WRITE) &&
796 transparent_hugepage_use_zero_page()) { 795 transparent_hugepage_use_zero_page()) {
797 pgtable_t pgtable; 796 pgtable_t pgtable;
798 unsigned long zero_pfn; 797 unsigned long zero_pfn;
799 bool set; 798 bool set;
800 pgtable = pte_alloc_one(mm, haddr); 799 pgtable = pte_alloc_one(mm, haddr);
801 if (unlikely(!pgtable)) 800 if (unlikely(!pgtable))
802 return VM_FAULT_OOM; 801 return VM_FAULT_OOM;
803 zero_pfn = get_huge_zero_page(); 802 zero_pfn = get_huge_zero_page();
804 if (unlikely(!zero_pfn)) { 803 if (unlikely(!zero_pfn)) {
805 pte_free(mm, pgtable); 804 pte_free(mm, pgtable);
806 count_vm_event(THP_FAULT_FALLBACK); 805 count_vm_event(THP_FAULT_FALLBACK);
807 goto out; 806 goto out;
808 } 807 }
809 spin_lock(&mm->page_table_lock); 808 spin_lock(&mm->page_table_lock);
810 set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, 809 set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
811 zero_pfn); 810 zero_pfn);
812 spin_unlock(&mm->page_table_lock); 811 spin_unlock(&mm->page_table_lock);
813 if (!set) { 812 if (!set) {
814 pte_free(mm, pgtable); 813 pte_free(mm, pgtable);
815 put_huge_zero_page(); 814 put_huge_zero_page();
816 } 815 }
817 return 0; 816 return 0;
818 } 817 }
819 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 818 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
820 vma, haddr, numa_node_id(), 0); 819 vma, haddr, numa_node_id(), 0);
821 if (unlikely(!page)) { 820 if (unlikely(!page)) {
822 count_vm_event(THP_FAULT_FALLBACK); 821 count_vm_event(THP_FAULT_FALLBACK);
823 goto out; 822 goto out;
824 } 823 }
825 count_vm_event(THP_FAULT_ALLOC); 824 count_vm_event(THP_FAULT_ALLOC);
826 if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { 825 if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
827 put_page(page); 826 put_page(page);
828 goto out; 827 goto out;
829 } 828 }
830 if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, 829 if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd,
831 page))) { 830 page))) {
832 mem_cgroup_uncharge_page(page); 831 mem_cgroup_uncharge_page(page);
833 put_page(page); 832 put_page(page);
834 goto out; 833 goto out;
835 } 834 }
836 835
837 return 0; 836 return 0;
838 } 837 }
839 out: 838 out:
840 /* 839 /*
841 * Use __pte_alloc instead of pte_alloc_map, because we can't 840 * Use __pte_alloc instead of pte_alloc_map, because we can't
842 * run pte_offset_map on the pmd, if an huge pmd could 841 * run pte_offset_map on the pmd, if an huge pmd could
843 * materialize from under us from a different thread. 842 * materialize from under us from a different thread.
844 */ 843 */
845 if (unlikely(pmd_none(*pmd)) && 844 if (unlikely(pmd_none(*pmd)) &&
846 unlikely(__pte_alloc(mm, vma, pmd, address))) 845 unlikely(__pte_alloc(mm, vma, pmd, address)))
847 return VM_FAULT_OOM; 846 return VM_FAULT_OOM;
848 /* if an huge pmd materialized from under us just retry later */ 847 /* if an huge pmd materialized from under us just retry later */
849 if (unlikely(pmd_trans_huge(*pmd))) 848 if (unlikely(pmd_trans_huge(*pmd)))
850 return 0; 849 return 0;
851 /* 850 /*
852 * A regular pmd is established and it can't morph into a huge pmd 851 * A regular pmd is established and it can't morph into a huge pmd
853 * from under us anymore at this point because we hold the mmap_sem 852 * from under us anymore at this point because we hold the mmap_sem
854 * read mode and khugepaged takes it in write mode. So now it's 853 * read mode and khugepaged takes it in write mode. So now it's
855 * safe to run pte_offset_map(). 854 * safe to run pte_offset_map().
856 */ 855 */
857 pte = pte_offset_map(pmd, address); 856 pte = pte_offset_map(pmd, address);
858 return handle_pte_fault(mm, vma, address, pte, pmd, flags); 857 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
859 } 858 }
860 859
861 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, 860 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
862 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, 861 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
863 struct vm_area_struct *vma) 862 struct vm_area_struct *vma)
864 { 863 {
865 struct page *src_page; 864 struct page *src_page;
866 pmd_t pmd; 865 pmd_t pmd;
867 pgtable_t pgtable; 866 pgtable_t pgtable;
868 int ret; 867 int ret;
869 868
870 ret = -ENOMEM; 869 ret = -ENOMEM;
871 pgtable = pte_alloc_one(dst_mm, addr); 870 pgtable = pte_alloc_one(dst_mm, addr);
872 if (unlikely(!pgtable)) 871 if (unlikely(!pgtable))
873 goto out; 872 goto out;
874 873
875 spin_lock(&dst_mm->page_table_lock); 874 spin_lock(&dst_mm->page_table_lock);
876 spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING); 875 spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
877 876
878 ret = -EAGAIN; 877 ret = -EAGAIN;
879 pmd = *src_pmd; 878 pmd = *src_pmd;
880 if (unlikely(!pmd_trans_huge(pmd))) { 879 if (unlikely(!pmd_trans_huge(pmd))) {
881 pte_free(dst_mm, pgtable); 880 pte_free(dst_mm, pgtable);
882 goto out_unlock; 881 goto out_unlock;
883 } 882 }
884 /* 883 /*
885 * mm->page_table_lock is enough to be sure that huge zero pmd is not 884 * mm->page_table_lock is enough to be sure that huge zero pmd is not
886 * under splitting since we don't split the page itself, only pmd to 885 * under splitting since we don't split the page itself, only pmd to
887 * a page table. 886 * a page table.
888 */ 887 */
889 if (is_huge_zero_pmd(pmd)) { 888 if (is_huge_zero_pmd(pmd)) {
890 unsigned long zero_pfn; 889 unsigned long zero_pfn;
891 bool set; 890 bool set;
892 /* 891 /*
893 * get_huge_zero_page() will never allocate a new page here, 892 * get_huge_zero_page() will never allocate a new page here,
894 * since we already have a zero page to copy. It just takes a 893 * since we already have a zero page to copy. It just takes a
895 * reference. 894 * reference.
896 */ 895 */
897 zero_pfn = get_huge_zero_page(); 896 zero_pfn = get_huge_zero_page();
898 set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, 897 set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
899 zero_pfn); 898 zero_pfn);
900 BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */ 899 BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
901 ret = 0; 900 ret = 0;
902 goto out_unlock; 901 goto out_unlock;
903 } 902 }
904 if (unlikely(pmd_trans_splitting(pmd))) { 903 if (unlikely(pmd_trans_splitting(pmd))) {
905 /* split huge page running from under us */ 904 /* split huge page running from under us */
906 spin_unlock(&src_mm->page_table_lock); 905 spin_unlock(&src_mm->page_table_lock);
907 spin_unlock(&dst_mm->page_table_lock); 906 spin_unlock(&dst_mm->page_table_lock);
908 pte_free(dst_mm, pgtable); 907 pte_free(dst_mm, pgtable);
909 908
910 wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ 909 wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
911 goto out; 910 goto out;
912 } 911 }
913 src_page = pmd_page(pmd); 912 src_page = pmd_page(pmd);
914 VM_BUG_ON(!PageHead(src_page)); 913 VM_BUG_ON(!PageHead(src_page));
915 get_page(src_page); 914 get_page(src_page);
916 page_dup_rmap(src_page); 915 page_dup_rmap(src_page);
917 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); 916 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
918 917
919 pmdp_set_wrprotect(src_mm, addr, src_pmd); 918 pmdp_set_wrprotect(src_mm, addr, src_pmd);
920 pmd = pmd_mkold(pmd_wrprotect(pmd)); 919 pmd = pmd_mkold(pmd_wrprotect(pmd));
921 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 920 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
922 pgtable_trans_huge_deposit(dst_mm, pgtable); 921 pgtable_trans_huge_deposit(dst_mm, pgtable);
923 dst_mm->nr_ptes++; 922 dst_mm->nr_ptes++;
924 923
925 ret = 0; 924 ret = 0;
926 out_unlock: 925 out_unlock:
927 spin_unlock(&src_mm->page_table_lock); 926 spin_unlock(&src_mm->page_table_lock);
928 spin_unlock(&dst_mm->page_table_lock); 927 spin_unlock(&dst_mm->page_table_lock);
929 out: 928 out:
930 return ret; 929 return ret;
931 } 930 }
932 931
933 void huge_pmd_set_accessed(struct mm_struct *mm, 932 void huge_pmd_set_accessed(struct mm_struct *mm,
934 struct vm_area_struct *vma, 933 struct vm_area_struct *vma,
935 unsigned long address, 934 unsigned long address,
936 pmd_t *pmd, pmd_t orig_pmd, 935 pmd_t *pmd, pmd_t orig_pmd,
937 int dirty) 936 int dirty)
938 { 937 {
939 pmd_t entry; 938 pmd_t entry;
940 unsigned long haddr; 939 unsigned long haddr;
941 940
942 spin_lock(&mm->page_table_lock); 941 spin_lock(&mm->page_table_lock);
943 if (unlikely(!pmd_same(*pmd, orig_pmd))) 942 if (unlikely(!pmd_same(*pmd, orig_pmd)))
944 goto unlock; 943 goto unlock;
945 944
946 entry = pmd_mkyoung(orig_pmd); 945 entry = pmd_mkyoung(orig_pmd);
947 haddr = address & HPAGE_PMD_MASK; 946 haddr = address & HPAGE_PMD_MASK;
948 if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) 947 if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
949 update_mmu_cache_pmd(vma, address, pmd); 948 update_mmu_cache_pmd(vma, address, pmd);
950 949
951 unlock: 950 unlock:
952 spin_unlock(&mm->page_table_lock); 951 spin_unlock(&mm->page_table_lock);
953 } 952 }
954 953
955 static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, 954 static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
956 struct vm_area_struct *vma, unsigned long address, 955 struct vm_area_struct *vma, unsigned long address,
957 pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr) 956 pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr)
958 { 957 {
959 pgtable_t pgtable; 958 pgtable_t pgtable;
960 pmd_t _pmd; 959 pmd_t _pmd;
961 struct page *page; 960 struct page *page;
962 int i, ret = 0; 961 int i, ret = 0;
963 unsigned long mmun_start; /* For mmu_notifiers */ 962 unsigned long mmun_start; /* For mmu_notifiers */
964 unsigned long mmun_end; /* For mmu_notifiers */ 963 unsigned long mmun_end; /* For mmu_notifiers */
965 964
966 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 965 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
967 if (!page) { 966 if (!page) {
968 ret |= VM_FAULT_OOM; 967 ret |= VM_FAULT_OOM;
969 goto out; 968 goto out;
970 } 969 }
971 970
972 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { 971 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
973 put_page(page); 972 put_page(page);
974 ret |= VM_FAULT_OOM; 973 ret |= VM_FAULT_OOM;
975 goto out; 974 goto out;
976 } 975 }
977 976
978 clear_user_highpage(page, address); 977 clear_user_highpage(page, address);
979 __SetPageUptodate(page); 978 __SetPageUptodate(page);
980 979
981 mmun_start = haddr; 980 mmun_start = haddr;
982 mmun_end = haddr + HPAGE_PMD_SIZE; 981 mmun_end = haddr + HPAGE_PMD_SIZE;
983 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 982 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
984 983
985 spin_lock(&mm->page_table_lock); 984 spin_lock(&mm->page_table_lock);
986 if (unlikely(!pmd_same(*pmd, orig_pmd))) 985 if (unlikely(!pmd_same(*pmd, orig_pmd)))
987 goto out_free_page; 986 goto out_free_page;
988 987
989 pmdp_clear_flush(vma, haddr, pmd); 988 pmdp_clear_flush(vma, haddr, pmd);
990 /* leave pmd empty until pte is filled */ 989 /* leave pmd empty until pte is filled */
991 990
992 pgtable = pgtable_trans_huge_withdraw(mm); 991 pgtable = pgtable_trans_huge_withdraw(mm);
993 pmd_populate(mm, &_pmd, pgtable); 992 pmd_populate(mm, &_pmd, pgtable);
994 993
995 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 994 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
996 pte_t *pte, entry; 995 pte_t *pte, entry;
997 if (haddr == (address & PAGE_MASK)) { 996 if (haddr == (address & PAGE_MASK)) {
998 entry = mk_pte(page, vma->vm_page_prot); 997 entry = mk_pte(page, vma->vm_page_prot);
999 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 998 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1000 page_add_new_anon_rmap(page, vma, haddr); 999 page_add_new_anon_rmap(page, vma, haddr);
1001 } else { 1000 } else {
1002 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); 1001 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
1003 entry = pte_mkspecial(entry); 1002 entry = pte_mkspecial(entry);
1004 } 1003 }
1005 pte = pte_offset_map(&_pmd, haddr); 1004 pte = pte_offset_map(&_pmd, haddr);
1006 VM_BUG_ON(!pte_none(*pte)); 1005 VM_BUG_ON(!pte_none(*pte));
1007 set_pte_at(mm, haddr, pte, entry); 1006 set_pte_at(mm, haddr, pte, entry);
1008 pte_unmap(pte); 1007 pte_unmap(pte);
1009 } 1008 }
1010 smp_wmb(); /* make pte visible before pmd */ 1009 smp_wmb(); /* make pte visible before pmd */
1011 pmd_populate(mm, pmd, pgtable); 1010 pmd_populate(mm, pmd, pgtable);
1012 spin_unlock(&mm->page_table_lock); 1011 spin_unlock(&mm->page_table_lock);
1013 put_huge_zero_page(); 1012 put_huge_zero_page();
1014 inc_mm_counter(mm, MM_ANONPAGES); 1013 inc_mm_counter(mm, MM_ANONPAGES);
1015 1014
1016 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1015 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1017 1016
1018 ret |= VM_FAULT_WRITE; 1017 ret |= VM_FAULT_WRITE;
1019 out: 1018 out:
1020 return ret; 1019 return ret;
1021 out_free_page: 1020 out_free_page:
1022 spin_unlock(&mm->page_table_lock); 1021 spin_unlock(&mm->page_table_lock);
1023 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1022 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1024 mem_cgroup_uncharge_page(page); 1023 mem_cgroup_uncharge_page(page);
1025 put_page(page); 1024 put_page(page);
1026 goto out; 1025 goto out;
1027 } 1026 }
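
The smp_wmb() just before pmd_populate() above is the usual publish pattern: every pte in the withdrawn page table is filled in first, and only then is the pmd that points at it made visible. A minimal userspace-flavoured sketch of the same ordering rule, with C11 atomics standing in for the kernel's smp_wmb()/pmd_populate() pair (the struct and function names here are illustrative, not kernel APIs):

#include <stdatomic.h>
#include <stdlib.h>

struct table { long slot[512]; };		/* stands in for a pte page */
static _Atomic(struct table *) published;	/* stands in for the pmd   */

static void publish_table(void)
{
	struct table *t = calloc(1, sizeof(*t));
	if (!t)
		return;
	for (int i = 0; i < 512; i++)		/* initialise every "pte" first */
		t->slot[i] = i;
	/* release store: a reader that sees the pointer also sees the slots */
	atomic_store_explicit(&published, t, memory_order_release);
}

int main(void)
{
	publish_table();
	return 0;
}
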
1028 1027
1029 static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, 1028 static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
1030 struct vm_area_struct *vma, 1029 struct vm_area_struct *vma,
1031 unsigned long address, 1030 unsigned long address,
1032 pmd_t *pmd, pmd_t orig_pmd, 1031 pmd_t *pmd, pmd_t orig_pmd,
1033 struct page *page, 1032 struct page *page,
1034 unsigned long haddr) 1033 unsigned long haddr)
1035 { 1034 {
1036 pgtable_t pgtable; 1035 pgtable_t pgtable;
1037 pmd_t _pmd; 1036 pmd_t _pmd;
1038 int ret = 0, i; 1037 int ret = 0, i;
1039 struct page **pages; 1038 struct page **pages;
1040 unsigned long mmun_start; /* For mmu_notifiers */ 1039 unsigned long mmun_start; /* For mmu_notifiers */
1041 unsigned long mmun_end; /* For mmu_notifiers */ 1040 unsigned long mmun_end; /* For mmu_notifiers */
1042 1041
1043 pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, 1042 pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
1044 GFP_KERNEL); 1043 GFP_KERNEL);
1045 if (unlikely(!pages)) { 1044 if (unlikely(!pages)) {
1046 ret |= VM_FAULT_OOM; 1045 ret |= VM_FAULT_OOM;
1047 goto out; 1046 goto out;
1048 } 1047 }
1049 1048
1050 for (i = 0; i < HPAGE_PMD_NR; i++) { 1049 for (i = 0; i < HPAGE_PMD_NR; i++) {
1051 pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | 1050 pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
1052 __GFP_OTHER_NODE, 1051 __GFP_OTHER_NODE,
1053 vma, address, page_to_nid(page)); 1052 vma, address, page_to_nid(page));
1054 if (unlikely(!pages[i] || 1053 if (unlikely(!pages[i] ||
1055 mem_cgroup_newpage_charge(pages[i], mm, 1054 mem_cgroup_newpage_charge(pages[i], mm,
1056 GFP_KERNEL))) { 1055 GFP_KERNEL))) {
1057 if (pages[i]) 1056 if (pages[i])
1058 put_page(pages[i]); 1057 put_page(pages[i]);
1059 mem_cgroup_uncharge_start(); 1058 mem_cgroup_uncharge_start();
1060 while (--i >= 0) { 1059 while (--i >= 0) {
1061 mem_cgroup_uncharge_page(pages[i]); 1060 mem_cgroup_uncharge_page(pages[i]);
1062 put_page(pages[i]); 1061 put_page(pages[i]);
1063 } 1062 }
1064 mem_cgroup_uncharge_end(); 1063 mem_cgroup_uncharge_end();
1065 kfree(pages); 1064 kfree(pages);
1066 ret |= VM_FAULT_OOM; 1065 ret |= VM_FAULT_OOM;
1067 goto out; 1066 goto out;
1068 } 1067 }
1069 } 1068 }
1070 1069
1071 for (i = 0; i < HPAGE_PMD_NR; i++) { 1070 for (i = 0; i < HPAGE_PMD_NR; i++) {
1072 copy_user_highpage(pages[i], page + i, 1071 copy_user_highpage(pages[i], page + i,
1073 haddr + PAGE_SIZE * i, vma); 1072 haddr + PAGE_SIZE * i, vma);
1074 __SetPageUptodate(pages[i]); 1073 __SetPageUptodate(pages[i]);
1075 cond_resched(); 1074 cond_resched();
1076 } 1075 }
1077 1076
1078 mmun_start = haddr; 1077 mmun_start = haddr;
1079 mmun_end = haddr + HPAGE_PMD_SIZE; 1078 mmun_end = haddr + HPAGE_PMD_SIZE;
1080 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 1079 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1081 1080
1082 spin_lock(&mm->page_table_lock); 1081 spin_lock(&mm->page_table_lock);
1083 if (unlikely(!pmd_same(*pmd, orig_pmd))) 1082 if (unlikely(!pmd_same(*pmd, orig_pmd)))
1084 goto out_free_pages; 1083 goto out_free_pages;
1085 VM_BUG_ON(!PageHead(page)); 1084 VM_BUG_ON(!PageHead(page));
1086 1085
1087 pmdp_clear_flush(vma, haddr, pmd); 1086 pmdp_clear_flush(vma, haddr, pmd);
1088 /* leave pmd empty until pte is filled */ 1087 /* leave pmd empty until pte is filled */
1089 1088
1090 pgtable = pgtable_trans_huge_withdraw(mm); 1089 pgtable = pgtable_trans_huge_withdraw(mm);
1091 pmd_populate(mm, &_pmd, pgtable); 1090 pmd_populate(mm, &_pmd, pgtable);
1092 1091
1093 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 1092 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1094 pte_t *pte, entry; 1093 pte_t *pte, entry;
1095 entry = mk_pte(pages[i], vma->vm_page_prot); 1094 entry = mk_pte(pages[i], vma->vm_page_prot);
1096 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1095 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1097 page_add_new_anon_rmap(pages[i], vma, haddr); 1096 page_add_new_anon_rmap(pages[i], vma, haddr);
1098 pte = pte_offset_map(&_pmd, haddr); 1097 pte = pte_offset_map(&_pmd, haddr);
1099 VM_BUG_ON(!pte_none(*pte)); 1098 VM_BUG_ON(!pte_none(*pte));
1100 set_pte_at(mm, haddr, pte, entry); 1099 set_pte_at(mm, haddr, pte, entry);
1101 pte_unmap(pte); 1100 pte_unmap(pte);
1102 } 1101 }
1103 kfree(pages); 1102 kfree(pages);
1104 1103
1105 smp_wmb(); /* make pte visible before pmd */ 1104 smp_wmb(); /* make pte visible before pmd */
1106 pmd_populate(mm, pmd, pgtable); 1105 pmd_populate(mm, pmd, pgtable);
1107 page_remove_rmap(page); 1106 page_remove_rmap(page);
1108 spin_unlock(&mm->page_table_lock); 1107 spin_unlock(&mm->page_table_lock);
1109 1108
1110 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1109 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1111 1110
1112 ret |= VM_FAULT_WRITE; 1111 ret |= VM_FAULT_WRITE;
1113 put_page(page); 1112 put_page(page);
1114 1113
1115 out: 1114 out:
1116 return ret; 1115 return ret;
1117 1116
1118 out_free_pages: 1117 out_free_pages:
1119 spin_unlock(&mm->page_table_lock); 1118 spin_unlock(&mm->page_table_lock);
1120 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1119 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1121 mem_cgroup_uncharge_start(); 1120 mem_cgroup_uncharge_start();
1122 for (i = 0; i < HPAGE_PMD_NR; i++) { 1121 for (i = 0; i < HPAGE_PMD_NR; i++) {
1123 mem_cgroup_uncharge_page(pages[i]); 1122 mem_cgroup_uncharge_page(pages[i]);
1124 put_page(pages[i]); 1123 put_page(pages[i]);
1125 } 1124 }
1126 mem_cgroup_uncharge_end(); 1125 mem_cgroup_uncharge_end();
1127 kfree(pages); 1126 kfree(pages);
1128 goto out; 1127 goto out;
1129 } 1128 }
1130 1129
1131 int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 1130 int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1132 unsigned long address, pmd_t *pmd, pmd_t orig_pmd) 1131 unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
1133 { 1132 {
1134 int ret = 0; 1133 int ret = 0;
1135 struct page *page = NULL, *new_page; 1134 struct page *page = NULL, *new_page;
1136 unsigned long haddr; 1135 unsigned long haddr;
1137 unsigned long mmun_start; /* For mmu_notifiers */ 1136 unsigned long mmun_start; /* For mmu_notifiers */
1138 unsigned long mmun_end; /* For mmu_notifiers */ 1137 unsigned long mmun_end; /* For mmu_notifiers */
1139 1138
1140 VM_BUG_ON(!vma->anon_vma); 1139 VM_BUG_ON(!vma->anon_vma);
1141 haddr = address & HPAGE_PMD_MASK; 1140 haddr = address & HPAGE_PMD_MASK;
1142 if (is_huge_zero_pmd(orig_pmd)) 1141 if (is_huge_zero_pmd(orig_pmd))
1143 goto alloc; 1142 goto alloc;
1144 spin_lock(&mm->page_table_lock); 1143 spin_lock(&mm->page_table_lock);
1145 if (unlikely(!pmd_same(*pmd, orig_pmd))) 1144 if (unlikely(!pmd_same(*pmd, orig_pmd)))
1146 goto out_unlock; 1145 goto out_unlock;
1147 1146
1148 page = pmd_page(orig_pmd); 1147 page = pmd_page(orig_pmd);
1149 VM_BUG_ON(!PageCompound(page) || !PageHead(page)); 1148 VM_BUG_ON(!PageCompound(page) || !PageHead(page));
1150 if (page_mapcount(page) == 1) { 1149 if (page_mapcount(page) == 1) {
1151 pmd_t entry; 1150 pmd_t entry;
1152 entry = pmd_mkyoung(orig_pmd); 1151 entry = pmd_mkyoung(orig_pmd);
1153 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1152 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1154 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) 1153 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
1155 update_mmu_cache_pmd(vma, address, pmd); 1154 update_mmu_cache_pmd(vma, address, pmd);
1156 ret |= VM_FAULT_WRITE; 1155 ret |= VM_FAULT_WRITE;
1157 goto out_unlock; 1156 goto out_unlock;
1158 } 1157 }
1159 get_page(page); 1158 get_page(page);
1160 spin_unlock(&mm->page_table_lock); 1159 spin_unlock(&mm->page_table_lock);
1161 alloc: 1160 alloc:
1162 if (transparent_hugepage_enabled(vma) && 1161 if (transparent_hugepage_enabled(vma) &&
1163 !transparent_hugepage_debug_cow()) 1162 !transparent_hugepage_debug_cow())
1164 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 1163 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
1165 vma, haddr, numa_node_id(), 0); 1164 vma, haddr, numa_node_id(), 0);
1166 else 1165 else
1167 new_page = NULL; 1166 new_page = NULL;
1168 1167
1169 if (unlikely(!new_page)) { 1168 if (unlikely(!new_page)) {
1170 count_vm_event(THP_FAULT_FALLBACK); 1169 count_vm_event(THP_FAULT_FALLBACK);
1171 if (is_huge_zero_pmd(orig_pmd)) { 1170 if (is_huge_zero_pmd(orig_pmd)) {
1172 ret = do_huge_pmd_wp_zero_page_fallback(mm, vma, 1171 ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
1173 address, pmd, orig_pmd, haddr); 1172 address, pmd, orig_pmd, haddr);
1174 } else { 1173 } else {
1175 ret = do_huge_pmd_wp_page_fallback(mm, vma, address, 1174 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
1176 pmd, orig_pmd, page, haddr); 1175 pmd, orig_pmd, page, haddr);
1177 if (ret & VM_FAULT_OOM) 1176 if (ret & VM_FAULT_OOM)
1178 split_huge_page(page); 1177 split_huge_page(page);
1179 put_page(page); 1178 put_page(page);
1180 } 1179 }
1181 goto out; 1180 goto out;
1182 } 1181 }
1183 count_vm_event(THP_FAULT_ALLOC); 1182 count_vm_event(THP_FAULT_ALLOC);
1184 1183
1185 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 1184 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
1186 put_page(new_page); 1185 put_page(new_page);
1187 if (page) { 1186 if (page) {
1188 split_huge_page(page); 1187 split_huge_page(page);
1189 put_page(page); 1188 put_page(page);
1190 } 1189 }
1191 ret |= VM_FAULT_OOM; 1190 ret |= VM_FAULT_OOM;
1192 goto out; 1191 goto out;
1193 } 1192 }
1194 1193
1195 if (is_huge_zero_pmd(orig_pmd)) 1194 if (is_huge_zero_pmd(orig_pmd))
1196 clear_huge_page(new_page, haddr, HPAGE_PMD_NR); 1195 clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
1197 else 1196 else
1198 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); 1197 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
1199 __SetPageUptodate(new_page); 1198 __SetPageUptodate(new_page);
1200 1199
1201 mmun_start = haddr; 1200 mmun_start = haddr;
1202 mmun_end = haddr + HPAGE_PMD_SIZE; 1201 mmun_end = haddr + HPAGE_PMD_SIZE;
1203 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 1202 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1204 1203
1205 spin_lock(&mm->page_table_lock); 1204 spin_lock(&mm->page_table_lock);
1206 if (page) 1205 if (page)
1207 put_page(page); 1206 put_page(page);
1208 if (unlikely(!pmd_same(*pmd, orig_pmd))) { 1207 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
1209 spin_unlock(&mm->page_table_lock); 1208 spin_unlock(&mm->page_table_lock);
1210 mem_cgroup_uncharge_page(new_page); 1209 mem_cgroup_uncharge_page(new_page);
1211 put_page(new_page); 1210 put_page(new_page);
1212 goto out_mn; 1211 goto out_mn;
1213 } else { 1212 } else {
1214 pmd_t entry; 1213 pmd_t entry;
1215 entry = mk_huge_pmd(new_page, vma); 1214 entry = mk_huge_pmd(new_page, vma);
1216 pmdp_clear_flush(vma, haddr, pmd); 1215 pmdp_clear_flush(vma, haddr, pmd);
1217 page_add_new_anon_rmap(new_page, vma, haddr); 1216 page_add_new_anon_rmap(new_page, vma, haddr);
1218 set_pmd_at(mm, haddr, pmd, entry); 1217 set_pmd_at(mm, haddr, pmd, entry);
1219 update_mmu_cache_pmd(vma, address, pmd); 1218 update_mmu_cache_pmd(vma, address, pmd);
1220 if (is_huge_zero_pmd(orig_pmd)) { 1219 if (is_huge_zero_pmd(orig_pmd)) {
1221 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); 1220 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
1222 put_huge_zero_page(); 1221 put_huge_zero_page();
1223 } else { 1222 } else {
1224 VM_BUG_ON(!PageHead(page)); 1223 VM_BUG_ON(!PageHead(page));
1225 page_remove_rmap(page); 1224 page_remove_rmap(page);
1226 put_page(page); 1225 put_page(page);
1227 } 1226 }
1228 ret |= VM_FAULT_WRITE; 1227 ret |= VM_FAULT_WRITE;
1229 } 1228 }
1230 spin_unlock(&mm->page_table_lock); 1229 spin_unlock(&mm->page_table_lock);
1231 out_mn: 1230 out_mn:
1232 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1231 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1233 out: 1232 out:
1234 return ret; 1233 return ret;
1235 out_unlock: 1234 out_unlock:
1236 spin_unlock(&mm->page_table_lock); 1235 spin_unlock(&mm->page_table_lock);
1237 return ret; 1236 return ret;
1238 } 1237 }
1239 1238
1240 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, 1239 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1241 unsigned long addr, 1240 unsigned long addr,
1242 pmd_t *pmd, 1241 pmd_t *pmd,
1243 unsigned int flags) 1242 unsigned int flags)
1244 { 1243 {
1245 struct mm_struct *mm = vma->vm_mm; 1244 struct mm_struct *mm = vma->vm_mm;
1246 struct page *page = NULL; 1245 struct page *page = NULL;
1247 1246
1248 assert_spin_locked(&mm->page_table_lock); 1247 assert_spin_locked(&mm->page_table_lock);
1249 1248
1250 if (flags & FOLL_WRITE && !pmd_write(*pmd)) 1249 if (flags & FOLL_WRITE && !pmd_write(*pmd))
1251 goto out; 1250 goto out;
1252 1251
1253 /* Avoid dumping huge zero page */ 1252 /* Avoid dumping huge zero page */
1254 if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) 1253 if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
1255 return ERR_PTR(-EFAULT); 1254 return ERR_PTR(-EFAULT);
1256 1255
1257 page = pmd_page(*pmd); 1256 page = pmd_page(*pmd);
1258 VM_BUG_ON(!PageHead(page)); 1257 VM_BUG_ON(!PageHead(page));
1259 if (flags & FOLL_TOUCH) { 1258 if (flags & FOLL_TOUCH) {
1260 pmd_t _pmd; 1259 pmd_t _pmd;
1261 /* 1260 /*
1262 * We should set the dirty bit only for FOLL_WRITE but 1261 * We should set the dirty bit only for FOLL_WRITE but
1263 * for now the dirty bit in the pmd is meaningless. 1262 * for now the dirty bit in the pmd is meaningless.
1264 * And if the dirty bit ever becomes meaningful and 1263 * And if the dirty bit ever becomes meaningful and
1265 * we only set it with FOLL_WRITE, an atomic 1264 * we only set it with FOLL_WRITE, an atomic
1266 * set_bit will be required on the pmd to set the 1265 * set_bit will be required on the pmd to set the
1267 * young bit, instead of the current set_pmd_at. 1266 * young bit, instead of the current set_pmd_at.
1268 */ 1267 */
1269 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); 1268 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
1270 set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); 1269 set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
1271 } 1270 }
1272 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { 1271 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1273 if (page->mapping && trylock_page(page)) { 1272 if (page->mapping && trylock_page(page)) {
1274 lru_add_drain(); 1273 lru_add_drain();
1275 if (page->mapping) 1274 if (page->mapping)
1276 mlock_vma_page(page); 1275 mlock_vma_page(page);
1277 unlock_page(page); 1276 unlock_page(page);
1278 } 1277 }
1279 } 1278 }
1280 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; 1279 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
1281 VM_BUG_ON(!PageCompound(page)); 1280 VM_BUG_ON(!PageCompound(page));
1282 if (flags & FOLL_GET) 1281 if (flags & FOLL_GET)
1283 get_page_foll(page); 1282 get_page_foll(page);
1284 1283
1285 out: 1284 out:
1286 return page; 1285 return page;
1287 } 1286 }
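
The last step above, page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT, turns the offset of addr inside the huge page into an index into the compound page, so the correct subpage is returned. A standalone sketch of that arithmetic, assuming the common x86_64 values of 4 KiB base pages and 2 MiB PMD-sized huge pages:

#include <stdio.h>

#define PAGE_SHIFT	12			/* 4 KiB base pages (assumed) */
#define HPAGE_PMD_SHIFT	21			/* 2 MiB huge pages (assumed) */
#define HPAGE_PMD_MASK	(~((1UL << HPAGE_PMD_SHIFT) - 1))

int main(void)
{
	unsigned long addr = 0x7f0000345678UL;		/* arbitrary address */
	unsigned long offset = addr & ~HPAGE_PMD_MASK;	/* offset inside the huge page */
	unsigned long index = offset >> PAGE_SHIFT;	/* which subpage that is */

	/* 0x145678 >> 12 == 0x145 == 325, i.e. head page + 325 */
	printf("subpage index = %lu\n", index);
	return 0;
}
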
1288 1287
1289 /* NUMA hinting page fault entry point for trans huge pmds */ 1288 /* NUMA hinting page fault entry point for trans huge pmds */
1290 int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 1289 int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1291 unsigned long addr, pmd_t pmd, pmd_t *pmdp) 1290 unsigned long addr, pmd_t pmd, pmd_t *pmdp)
1292 { 1291 {
1293 struct page *page; 1292 struct page *page;
1294 unsigned long haddr = addr & HPAGE_PMD_MASK; 1293 unsigned long haddr = addr & HPAGE_PMD_MASK;
1295 int target_nid; 1294 int target_nid;
1296 int current_nid = -1; 1295 int current_nid = -1;
1297 bool migrated; 1296 bool migrated;
1298 1297
1299 spin_lock(&mm->page_table_lock); 1298 spin_lock(&mm->page_table_lock);
1300 if (unlikely(!pmd_same(pmd, *pmdp))) 1299 if (unlikely(!pmd_same(pmd, *pmdp)))
1301 goto out_unlock; 1300 goto out_unlock;
1302 1301
1303 page = pmd_page(pmd); 1302 page = pmd_page(pmd);
1304 get_page(page); 1303 get_page(page);
1305 current_nid = page_to_nid(page); 1304 current_nid = page_to_nid(page);
1306 count_vm_numa_event(NUMA_HINT_FAULTS); 1305 count_vm_numa_event(NUMA_HINT_FAULTS);
1307 if (current_nid == numa_node_id()) 1306 if (current_nid == numa_node_id())
1308 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); 1307 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
1309 1308
1310 target_nid = mpol_misplaced(page, vma, haddr); 1309 target_nid = mpol_misplaced(page, vma, haddr);
1311 if (target_nid == -1) { 1310 if (target_nid == -1) {
1312 put_page(page); 1311 put_page(page);
1313 goto clear_pmdnuma; 1312 goto clear_pmdnuma;
1314 } 1313 }
1315 1314
1316 /* Acquire the page lock to serialise THP migrations */ 1315 /* Acquire the page lock to serialise THP migrations */
1317 spin_unlock(&mm->page_table_lock); 1316 spin_unlock(&mm->page_table_lock);
1318 lock_page(page); 1317 lock_page(page);
1319 1318
1320 /* Confirm the PMD did not change while the page table lock was dropped */ 1319 /* Confirm the PMD did not change while the page table lock was dropped */
1321 spin_lock(&mm->page_table_lock); 1320 spin_lock(&mm->page_table_lock);
1322 if (unlikely(!pmd_same(pmd, *pmdp))) { 1321 if (unlikely(!pmd_same(pmd, *pmdp))) {
1323 unlock_page(page); 1322 unlock_page(page);
1324 put_page(page); 1323 put_page(page);
1325 goto out_unlock; 1324 goto out_unlock;
1326 } 1325 }
1327 spin_unlock(&mm->page_table_lock); 1326 spin_unlock(&mm->page_table_lock);
1328 1327
1329 /* Migrate the THP to the requested node */ 1328 /* Migrate the THP to the requested node */
1330 migrated = migrate_misplaced_transhuge_page(mm, vma, 1329 migrated = migrate_misplaced_transhuge_page(mm, vma,
1331 pmdp, pmd, addr, page, target_nid); 1330 pmdp, pmd, addr, page, target_nid);
1332 if (!migrated) 1331 if (!migrated)
1333 goto check_same; 1332 goto check_same;
1334 1333
1335 task_numa_fault(target_nid, HPAGE_PMD_NR, true); 1334 task_numa_fault(target_nid, HPAGE_PMD_NR, true);
1336 return 0; 1335 return 0;
1337 1336
1338 check_same: 1337 check_same:
1339 spin_lock(&mm->page_table_lock); 1338 spin_lock(&mm->page_table_lock);
1340 if (unlikely(!pmd_same(pmd, *pmdp))) 1339 if (unlikely(!pmd_same(pmd, *pmdp)))
1341 goto out_unlock; 1340 goto out_unlock;
1342 clear_pmdnuma: 1341 clear_pmdnuma:
1343 pmd = pmd_mknonnuma(pmd); 1342 pmd = pmd_mknonnuma(pmd);
1344 set_pmd_at(mm, haddr, pmdp, pmd); 1343 set_pmd_at(mm, haddr, pmdp, pmd);
1345 VM_BUG_ON(pmd_numa(*pmdp)); 1344 VM_BUG_ON(pmd_numa(*pmdp));
1346 update_mmu_cache_pmd(vma, addr, pmdp); 1345 update_mmu_cache_pmd(vma, addr, pmdp);
1347 out_unlock: 1346 out_unlock:
1348 spin_unlock(&mm->page_table_lock); 1347 spin_unlock(&mm->page_table_lock);
1349 if (current_nid != -1) 1348 if (current_nid != -1)
1350 task_numa_fault(current_nid, HPAGE_PMD_NR, false); 1349 task_numa_fault(current_nid, HPAGE_PMD_NR, false);
1351 return 0; 1350 return 0;
1352 } 1351 }
1353 1352
1354 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1353 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1355 pmd_t *pmd, unsigned long addr) 1354 pmd_t *pmd, unsigned long addr)
1356 { 1355 {
1357 int ret = 0; 1356 int ret = 0;
1358 1357
1359 if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1358 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1360 struct page *page; 1359 struct page *page;
1361 pgtable_t pgtable; 1360 pgtable_t pgtable;
1362 pmd_t orig_pmd; 1361 pmd_t orig_pmd;
1363 pgtable = pgtable_trans_huge_withdraw(tlb->mm); 1362 pgtable = pgtable_trans_huge_withdraw(tlb->mm);
1364 orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); 1363 orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
1365 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 1364 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1366 if (is_huge_zero_pmd(orig_pmd)) { 1365 if (is_huge_zero_pmd(orig_pmd)) {
1367 tlb->mm->nr_ptes--; 1366 tlb->mm->nr_ptes--;
1368 spin_unlock(&tlb->mm->page_table_lock); 1367 spin_unlock(&tlb->mm->page_table_lock);
1369 put_huge_zero_page(); 1368 put_huge_zero_page();
1370 } else { 1369 } else {
1371 page = pmd_page(orig_pmd); 1370 page = pmd_page(orig_pmd);
1372 page_remove_rmap(page); 1371 page_remove_rmap(page);
1373 VM_BUG_ON(page_mapcount(page) < 0); 1372 VM_BUG_ON(page_mapcount(page) < 0);
1374 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); 1373 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1375 VM_BUG_ON(!PageHead(page)); 1374 VM_BUG_ON(!PageHead(page));
1376 tlb->mm->nr_ptes--; 1375 tlb->mm->nr_ptes--;
1377 spin_unlock(&tlb->mm->page_table_lock); 1376 spin_unlock(&tlb->mm->page_table_lock);
1378 tlb_remove_page(tlb, page); 1377 tlb_remove_page(tlb, page);
1379 } 1378 }
1380 pte_free(tlb->mm, pgtable); 1379 pte_free(tlb->mm, pgtable);
1381 ret = 1; 1380 ret = 1;
1382 } 1381 }
1383 return ret; 1382 return ret;
1384 } 1383 }
1385 1384
1386 int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 1385 int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1387 unsigned long addr, unsigned long end, 1386 unsigned long addr, unsigned long end,
1388 unsigned char *vec) 1387 unsigned char *vec)
1389 { 1388 {
1390 int ret = 0; 1389 int ret = 0;
1391 1390
1392 if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1391 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1393 /* 1392 /*
1394 * All logical pages in the range are present 1393 * All logical pages in the range are present
1395 * if backed by a huge page. 1394 * if backed by a huge page.
1396 */ 1395 */
1397 spin_unlock(&vma->vm_mm->page_table_lock); 1396 spin_unlock(&vma->vm_mm->page_table_lock);
1398 memset(vec, 1, (end - addr) >> PAGE_SHIFT); 1397 memset(vec, 1, (end - addr) >> PAGE_SHIFT);
1399 ret = 1; 1398 ret = 1;
1400 } 1399 }
1401 1400
1402 return ret; 1401 return ret;
1403 } 1402 }
1404 1403
1405 int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, 1404 int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1406 unsigned long old_addr, 1405 unsigned long old_addr,
1407 unsigned long new_addr, unsigned long old_end, 1406 unsigned long new_addr, unsigned long old_end,
1408 pmd_t *old_pmd, pmd_t *new_pmd) 1407 pmd_t *old_pmd, pmd_t *new_pmd)
1409 { 1408 {
1410 int ret = 0; 1409 int ret = 0;
1411 pmd_t pmd; 1410 pmd_t pmd;
1412 1411
1413 struct mm_struct *mm = vma->vm_mm; 1412 struct mm_struct *mm = vma->vm_mm;
1414 1413
1415 if ((old_addr & ~HPAGE_PMD_MASK) || 1414 if ((old_addr & ~HPAGE_PMD_MASK) ||
1416 (new_addr & ~HPAGE_PMD_MASK) || 1415 (new_addr & ~HPAGE_PMD_MASK) ||
1417 old_end - old_addr < HPAGE_PMD_SIZE || 1416 old_end - old_addr < HPAGE_PMD_SIZE ||
1418 (new_vma->vm_flags & VM_NOHUGEPAGE)) 1417 (new_vma->vm_flags & VM_NOHUGEPAGE))
1419 goto out; 1418 goto out;
1420 1419
1421 /* 1420 /*
1422 * The destination pmd shouldn't be established; free_pgtables() 1421 * The destination pmd shouldn't be established; free_pgtables()
1423 * should have released it. 1422 * should have released it.
1424 */ 1423 */
1425 if (WARN_ON(!pmd_none(*new_pmd))) { 1424 if (WARN_ON(!pmd_none(*new_pmd))) {
1426 VM_BUG_ON(pmd_trans_huge(*new_pmd)); 1425 VM_BUG_ON(pmd_trans_huge(*new_pmd));
1427 goto out; 1426 goto out;
1428 } 1427 }
1429 1428
1430 ret = __pmd_trans_huge_lock(old_pmd, vma); 1429 ret = __pmd_trans_huge_lock(old_pmd, vma);
1431 if (ret == 1) { 1430 if (ret == 1) {
1432 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); 1431 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
1433 VM_BUG_ON(!pmd_none(*new_pmd)); 1432 VM_BUG_ON(!pmd_none(*new_pmd));
1434 set_pmd_at(mm, new_addr, new_pmd, pmd); 1433 set_pmd_at(mm, new_addr, new_pmd, pmd);
1435 spin_unlock(&mm->page_table_lock); 1434 spin_unlock(&mm->page_table_lock);
1436 } 1435 }
1437 out: 1436 out:
1438 return ret; 1437 return ret;
1439 } 1438 }
1440 1439
1441 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 1440 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1442 unsigned long addr, pgprot_t newprot, int prot_numa) 1441 unsigned long addr, pgprot_t newprot, int prot_numa)
1443 { 1442 {
1444 struct mm_struct *mm = vma->vm_mm; 1443 struct mm_struct *mm = vma->vm_mm;
1445 int ret = 0; 1444 int ret = 0;
1446 1445
1447 if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1446 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1448 pmd_t entry; 1447 pmd_t entry;
1449 entry = pmdp_get_and_clear(mm, addr, pmd); 1448 entry = pmdp_get_and_clear(mm, addr, pmd);
1450 if (!prot_numa) { 1449 if (!prot_numa) {
1451 entry = pmd_modify(entry, newprot); 1450 entry = pmd_modify(entry, newprot);
1452 BUG_ON(pmd_write(entry)); 1451 BUG_ON(pmd_write(entry));
1453 } else { 1452 } else {
1454 struct page *page = pmd_page(*pmd); 1453 struct page *page = pmd_page(*pmd);
1455 1454
1456 /* only check non-shared pages */ 1455 /* only check non-shared pages */
1457 if (page_mapcount(page) == 1 && 1456 if (page_mapcount(page) == 1 &&
1458 !pmd_numa(*pmd)) { 1457 !pmd_numa(*pmd)) {
1459 entry = pmd_mknuma(entry); 1458 entry = pmd_mknuma(entry);
1460 } 1459 }
1461 } 1460 }
1462 set_pmd_at(mm, addr, pmd, entry); 1461 set_pmd_at(mm, addr, pmd, entry);
1463 spin_unlock(&vma->vm_mm->page_table_lock); 1462 spin_unlock(&vma->vm_mm->page_table_lock);
1464 ret = 1; 1463 ret = 1;
1465 } 1464 }
1466 1465
1467 return ret; 1466 return ret;
1468 } 1467 }
1469 1468
1470 /* 1469 /*
1471 * Returns 1 if a given pmd maps a stable (not under splitting) thp. 1470 * Returns 1 if a given pmd maps a stable (not under splitting) thp.
1472 * Returns -1 if it maps a thp under splitting. Returns 0 otherwise. 1471 * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
1473 * 1472 *
1474 * Note that if it returns 1, this routine returns without unlocking page 1473 * Note that if it returns 1, this routine returns without unlocking page
1475 * table locks. So callers must unlock them. 1474 * table locks. So callers must unlock them.
1476 */ 1475 */
1477 int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) 1476 int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
1478 { 1477 {
1479 spin_lock(&vma->vm_mm->page_table_lock); 1478 spin_lock(&vma->vm_mm->page_table_lock);
1480 if (likely(pmd_trans_huge(*pmd))) { 1479 if (likely(pmd_trans_huge(*pmd))) {
1481 if (unlikely(pmd_trans_splitting(*pmd))) { 1480 if (unlikely(pmd_trans_splitting(*pmd))) {
1482 spin_unlock(&vma->vm_mm->page_table_lock); 1481 spin_unlock(&vma->vm_mm->page_table_lock);
1483 wait_split_huge_page(vma->anon_vma, pmd); 1482 wait_split_huge_page(vma->anon_vma, pmd);
1484 return -1; 1483 return -1;
1485 } else { 1484 } else {
1486 /* Thp mapped by 'pmd' is stable, so we can 1485 /* Thp mapped by 'pmd' is stable, so we can
1487 * handle it as it is. */ 1486 * handle it as it is. */
1488 return 1; 1487 return 1;
1489 } 1488 }
1490 } 1489 }
1491 spin_unlock(&vma->vm_mm->page_table_lock); 1490 spin_unlock(&vma->vm_mm->page_table_lock);
1492 return 0; 1491 return 0;
1493 } 1492 }
1494 1493
1495 pmd_t *page_check_address_pmd(struct page *page, 1494 pmd_t *page_check_address_pmd(struct page *page,
1496 struct mm_struct *mm, 1495 struct mm_struct *mm,
1497 unsigned long address, 1496 unsigned long address,
1498 enum page_check_address_pmd_flag flag) 1497 enum page_check_address_pmd_flag flag)
1499 { 1498 {
1500 pmd_t *pmd, *ret = NULL; 1499 pmd_t *pmd, *ret = NULL;
1501 1500
1502 if (address & ~HPAGE_PMD_MASK) 1501 if (address & ~HPAGE_PMD_MASK)
1503 goto out; 1502 goto out;
1504 1503
1505 pmd = mm_find_pmd(mm, address); 1504 pmd = mm_find_pmd(mm, address);
1506 if (!pmd) 1505 if (!pmd)
1507 goto out; 1506 goto out;
1508 if (pmd_none(*pmd)) 1507 if (pmd_none(*pmd))
1509 goto out; 1508 goto out;
1510 if (pmd_page(*pmd) != page) 1509 if (pmd_page(*pmd) != page)
1511 goto out; 1510 goto out;
1512 /* 1511 /*
1513 * split_vma() may create temporary aliased mappings. There is 1512 * split_vma() may create temporary aliased mappings. There is
1514 * no risk as long as all huge pmds are found and have their 1513 * no risk as long as all huge pmds are found and have their
1515 * splitting bit set before __split_huge_page_refcount 1514 * splitting bit set before __split_huge_page_refcount
1516 * runs. Finding the same huge pmd more than once during the 1515 * runs. Finding the same huge pmd more than once during the
1517 * same rmap walk is not a problem. 1516 * same rmap walk is not a problem.
1518 */ 1517 */
1519 if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && 1518 if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
1520 pmd_trans_splitting(*pmd)) 1519 pmd_trans_splitting(*pmd))
1521 goto out; 1520 goto out;
1522 if (pmd_trans_huge(*pmd)) { 1521 if (pmd_trans_huge(*pmd)) {
1523 VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && 1522 VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
1524 !pmd_trans_splitting(*pmd)); 1523 !pmd_trans_splitting(*pmd));
1525 ret = pmd; 1524 ret = pmd;
1526 } 1525 }
1527 out: 1526 out:
1528 return ret; 1527 return ret;
1529 } 1528 }
1530 1529
1531 static int __split_huge_page_splitting(struct page *page, 1530 static int __split_huge_page_splitting(struct page *page,
1532 struct vm_area_struct *vma, 1531 struct vm_area_struct *vma,
1533 unsigned long address) 1532 unsigned long address)
1534 { 1533 {
1535 struct mm_struct *mm = vma->vm_mm; 1534 struct mm_struct *mm = vma->vm_mm;
1536 pmd_t *pmd; 1535 pmd_t *pmd;
1537 int ret = 0; 1536 int ret = 0;
1538 /* For mmu_notifiers */ 1537 /* For mmu_notifiers */
1539 const unsigned long mmun_start = address; 1538 const unsigned long mmun_start = address;
1540 const unsigned long mmun_end = address + HPAGE_PMD_SIZE; 1539 const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
1541 1540
1542 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 1541 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1543 spin_lock(&mm->page_table_lock); 1542 spin_lock(&mm->page_table_lock);
1544 pmd = page_check_address_pmd(page, mm, address, 1543 pmd = page_check_address_pmd(page, mm, address,
1545 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); 1544 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
1546 if (pmd) { 1545 if (pmd) {
1547 /* 1546 /*
1548 * We can't temporarily set the pmd to null in order 1547 * We can't temporarily set the pmd to null in order
1549 * to split it; the pmd must remain marked huge at all 1548 * to split it; the pmd must remain marked huge at all
1550 * times or the VM won't take the pmd_trans_huge paths 1549 * times or the VM won't take the pmd_trans_huge paths
1551 * and it won't wait on the anon_vma->root->rwsem to 1550 * and it won't wait on the anon_vma->root->rwsem to
1552 * serialize against split_huge_page*. 1551 * serialize against split_huge_page*.
1553 */ 1552 */
1554 pmdp_splitting_flush(vma, address, pmd); 1553 pmdp_splitting_flush(vma, address, pmd);
1555 ret = 1; 1554 ret = 1;
1556 } 1555 }
1557 spin_unlock(&mm->page_table_lock); 1556 spin_unlock(&mm->page_table_lock);
1558 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1557 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1559 1558
1560 return ret; 1559 return ret;
1561 } 1560 }
1562 1561
1563 static void __split_huge_page_refcount(struct page *page) 1562 static void __split_huge_page_refcount(struct page *page)
1564 { 1563 {
1565 int i; 1564 int i;
1566 struct zone *zone = page_zone(page); 1565 struct zone *zone = page_zone(page);
1567 struct lruvec *lruvec; 1566 struct lruvec *lruvec;
1568 int tail_count = 0; 1567 int tail_count = 0;
1569 1568
1570 /* prevent PageLRU from going away from under us, and freeze lru stats */ 1569 /* prevent PageLRU from going away from under us, and freeze lru stats */
1571 spin_lock_irq(&zone->lru_lock); 1570 spin_lock_irq(&zone->lru_lock);
1572 lruvec = mem_cgroup_page_lruvec(page, zone); 1571 lruvec = mem_cgroup_page_lruvec(page, zone);
1573 1572
1574 compound_lock(page); 1573 compound_lock(page);
1575 /* complete memcg work before adding pages to LRU */ 1574 /* complete memcg work before adding pages to LRU */
1576 mem_cgroup_split_huge_fixup(page); 1575 mem_cgroup_split_huge_fixup(page);
1577 1576
1578 for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { 1577 for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
1579 struct page *page_tail = page + i; 1578 struct page *page_tail = page + i;
1580 1579
1581 /* tail_page->_mapcount cannot change */ 1580 /* tail_page->_mapcount cannot change */
1582 BUG_ON(page_mapcount(page_tail) < 0); 1581 BUG_ON(page_mapcount(page_tail) < 0);
1583 tail_count += page_mapcount(page_tail); 1582 tail_count += page_mapcount(page_tail);
1584 /* check for overflow */ 1583 /* check for overflow */
1585 BUG_ON(tail_count < 0); 1584 BUG_ON(tail_count < 0);
1586 BUG_ON(atomic_read(&page_tail->_count) != 0); 1585 BUG_ON(atomic_read(&page_tail->_count) != 0);
1587 /* 1586 /*
1588 * tail_page->_count is zero and not changing from 1587 * tail_page->_count is zero and not changing from
1589 * under us. But get_page_unless_zero() may be running 1588 * under us. But get_page_unless_zero() may be running
1590 * from under us on the tail_page. If we used 1589 * from under us on the tail_page. If we used
1591 * atomic_set() below instead of atomic_add(), we 1590 * atomic_set() below instead of atomic_add(), we
1592 * would then run atomic_set() concurrently with 1591 * would then run atomic_set() concurrently with
1593 * get_page_unless_zero(), and atomic_set() is 1592 * get_page_unless_zero(), and atomic_set() is
1594 * implemented in C not using locked ops. spin_unlock 1593 * implemented in C not using locked ops. spin_unlock
1595 * on x86 sometimes uses locked ops because of PPro 1594 * on x86 sometimes uses locked ops because of PPro
1596 * errata 66, 92, so unless somebody can guarantee 1595 * errata 66, 92, so unless somebody can guarantee
1597 * atomic_set() here would be safe on all archs (and 1596 * atomic_set() here would be safe on all archs (and
1598 * not only on x86), it's safer to use atomic_add(). 1597 * not only on x86), it's safer to use atomic_add().
1599 */ 1598 */
1600 atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1, 1599 atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
1601 &page_tail->_count); 1600 &page_tail->_count);
1602 1601
1603 /* after clearing PageTail the gup refcount can be released */ 1602 /* after clearing PageTail the gup refcount can be released */
1604 smp_mb(); 1603 smp_mb();
1605 1604
1606 /* 1605 /*
1607 * retain hwpoison flag of the poisoned tail page: 1606 * retain hwpoison flag of the poisoned tail page:
1608 * fix for an unsuitable process being killed on the guest machine (KVM) 1607 * fix for an unsuitable process being killed on the guest machine (KVM)
1609 * by memory-failure handling. 1608 * by memory-failure handling.
1610 */ 1609 */
1611 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON; 1610 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
1612 page_tail->flags |= (page->flags & 1611 page_tail->flags |= (page->flags &
1613 ((1L << PG_referenced) | 1612 ((1L << PG_referenced) |
1614 (1L << PG_swapbacked) | 1613 (1L << PG_swapbacked) |
1615 (1L << PG_mlocked) | 1614 (1L << PG_mlocked) |
1616 (1L << PG_uptodate))); 1615 (1L << PG_uptodate)));
1617 page_tail->flags |= (1L << PG_dirty); 1616 page_tail->flags |= (1L << PG_dirty);
1618 1617
1619 /* clear PageTail before overwriting first_page */ 1618 /* clear PageTail before overwriting first_page */
1620 smp_wmb(); 1619 smp_wmb();
1621 1620
1622 /* 1621 /*
1623 * __split_huge_page_splitting() already set the 1622 * __split_huge_page_splitting() already set the
1624 * splitting bit in all pmds that could map this 1623 * splitting bit in all pmds that could map this
1625 * hugepage, which will ensure no CPU can alter the 1624 * hugepage, which will ensure no CPU can alter the
1626 * mapcount on the head page. The mapcount is only 1625 * mapcount on the head page. The mapcount is only
1627 * accounted in the head page and it has to be 1626 * accounted in the head page and it has to be
1628 * transferred to all tail pages in the below code. So 1627 * transferred to all tail pages in the below code. So
1629 * for this code to be safe, during the split the mapcount 1628 * for this code to be safe, during the split the mapcount
1630 * can't change. But that doesn't mean userland can't 1629 * can't change. But that doesn't mean userland can't
1631 * keep changing and reading the page contents while 1630 * keep changing and reading the page contents while
1632 * we transfer the mapcount, so the pmd splitting 1631 * we transfer the mapcount, so the pmd splitting
1633 * status is achieved by setting a reserved bit in the 1632 * status is achieved by setting a reserved bit in the
1634 * pmd, not by clearing the present bit. 1633 * pmd, not by clearing the present bit.
1635 */ 1634 */
1636 page_tail->_mapcount = page->_mapcount; 1635 page_tail->_mapcount = page->_mapcount;
1637 1636
1638 BUG_ON(page_tail->mapping); 1637 BUG_ON(page_tail->mapping);
1639 page_tail->mapping = page->mapping; 1638 page_tail->mapping = page->mapping;
1640 1639
1641 page_tail->index = page->index + i; 1640 page_tail->index = page->index + i;
1642 page_nid_xchg_last(page_tail, page_nid_last(page)); 1641 page_nid_xchg_last(page_tail, page_nid_last(page));
1643 1642
1644 BUG_ON(!PageAnon(page_tail)); 1643 BUG_ON(!PageAnon(page_tail));
1645 BUG_ON(!PageUptodate(page_tail)); 1644 BUG_ON(!PageUptodate(page_tail));
1646 BUG_ON(!PageDirty(page_tail)); 1645 BUG_ON(!PageDirty(page_tail));
1647 BUG_ON(!PageSwapBacked(page_tail)); 1646 BUG_ON(!PageSwapBacked(page_tail));
1648 1647
1649 lru_add_page_tail(page, page_tail, lruvec); 1648 lru_add_page_tail(page, page_tail, lruvec);
1650 } 1649 }
1651 atomic_sub(tail_count, &page->_count); 1650 atomic_sub(tail_count, &page->_count);
1652 BUG_ON(atomic_read(&page->_count) <= 0); 1651 BUG_ON(atomic_read(&page->_count) <= 0);
1653 1652
1654 __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1); 1653 __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
1655 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); 1654 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
1656 1655
1657 ClearPageCompound(page); 1656 ClearPageCompound(page);
1658 compound_unlock(page); 1657 compound_unlock(page);
1659 spin_unlock_irq(&zone->lru_lock); 1658 spin_unlock_irq(&zone->lru_lock);
1660 1659
1661 for (i = 1; i < HPAGE_PMD_NR; i++) { 1660 for (i = 1; i < HPAGE_PMD_NR; i++) {
1662 struct page *page_tail = page + i; 1661 struct page *page_tail = page + i;
1663 BUG_ON(page_count(page_tail) <= 0); 1662 BUG_ON(page_count(page_tail) <= 0);
1664 /* 1663 /*
1665 * Tail pages may be freed if there wasn't any mapping 1664 * Tail pages may be freed if there wasn't any mapping
1666 * left, for example if add_to_swap() is running on an lru page that 1665 * left, for example if add_to_swap() is running on an lru page that
1667 * had its mapping zapped. And freeing these pages 1666 * had its mapping zapped. And freeing these pages
1668 * requires taking the lru_lock so we do the put_page 1667 * requires taking the lru_lock so we do the put_page
1669 * of the tail pages after the split is complete. 1668 * of the tail pages after the split is complete.
1670 */ 1669 */
1671 put_page(page_tail); 1670 put_page(page_tail);
1672 } 1671 }
1673 1672
1674 /* 1673 /*
1675 * Only the head page (which has now become a regular page) is required 1674 * Only the head page (which has now become a regular page) is required
1676 * to be pinned by the caller. 1675 * to be pinned by the caller.
1677 */ 1676 */
1678 BUG_ON(page_count(page) <= 0); 1677 BUG_ON(page_count(page) <= 0);
1679 } 1678 }
1680 1679
1681 static int __split_huge_page_map(struct page *page, 1680 static int __split_huge_page_map(struct page *page,
1682 struct vm_area_struct *vma, 1681 struct vm_area_struct *vma,
1683 unsigned long address) 1682 unsigned long address)
1684 { 1683 {
1685 struct mm_struct *mm = vma->vm_mm; 1684 struct mm_struct *mm = vma->vm_mm;
1686 pmd_t *pmd, _pmd; 1685 pmd_t *pmd, _pmd;
1687 int ret = 0, i; 1686 int ret = 0, i;
1688 pgtable_t pgtable; 1687 pgtable_t pgtable;
1689 unsigned long haddr; 1688 unsigned long haddr;
1690 1689
1691 spin_lock(&mm->page_table_lock); 1690 spin_lock(&mm->page_table_lock);
1692 pmd = page_check_address_pmd(page, mm, address, 1691 pmd = page_check_address_pmd(page, mm, address,
1693 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); 1692 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
1694 if (pmd) { 1693 if (pmd) {
1695 pgtable = pgtable_trans_huge_withdraw(mm); 1694 pgtable = pgtable_trans_huge_withdraw(mm);
1696 pmd_populate(mm, &_pmd, pgtable); 1695 pmd_populate(mm, &_pmd, pgtable);
1697 1696
1698 haddr = address; 1697 haddr = address;
1699 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 1698 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1700 pte_t *pte, entry; 1699 pte_t *pte, entry;
1701 BUG_ON(PageCompound(page+i)); 1700 BUG_ON(PageCompound(page+i));
1702 entry = mk_pte(page + i, vma->vm_page_prot); 1701 entry = mk_pte(page + i, vma->vm_page_prot);
1703 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1702 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1704 if (!pmd_write(*pmd)) 1703 if (!pmd_write(*pmd))
1705 entry = pte_wrprotect(entry); 1704 entry = pte_wrprotect(entry);
1706 else 1705 else
1707 BUG_ON(page_mapcount(page) != 1); 1706 BUG_ON(page_mapcount(page) != 1);
1708 if (!pmd_young(*pmd)) 1707 if (!pmd_young(*pmd))
1709 entry = pte_mkold(entry); 1708 entry = pte_mkold(entry);
1710 if (pmd_numa(*pmd)) 1709 if (pmd_numa(*pmd))
1711 entry = pte_mknuma(entry); 1710 entry = pte_mknuma(entry);
1712 pte = pte_offset_map(&_pmd, haddr); 1711 pte = pte_offset_map(&_pmd, haddr);
1713 BUG_ON(!pte_none(*pte)); 1712 BUG_ON(!pte_none(*pte));
1714 set_pte_at(mm, haddr, pte, entry); 1713 set_pte_at(mm, haddr, pte, entry);
1715 pte_unmap(pte); 1714 pte_unmap(pte);
1716 } 1715 }
1717 1716
1718 smp_wmb(); /* make pte visible before pmd */ 1717 smp_wmb(); /* make pte visible before pmd */
1719 /* 1718 /*
1720 * Up to this point the pmd is present and huge and 1719 * Up to this point the pmd is present and huge and
1721 * userland has the whole access to the hugepage 1720 * userland has the whole access to the hugepage
1722 * during the split (which happens in place). If we 1721 * during the split (which happens in place). If we
1723 * overwrite the pmd with the not-huge version 1722 * overwrite the pmd with the not-huge version
1724 * pointing to the pte here (which of course we could 1723 * pointing to the pte here (which of course we could
1725 * if all CPUs were bug free), userland could trigger 1724 * if all CPUs were bug free), userland could trigger
1726 * a small page size TLB miss on the small sized TLB 1725 * a small page size TLB miss on the small sized TLB
1727 * while the hugepage TLB entry is still established 1726 * while the hugepage TLB entry is still established
1727 * in the huge TLB. Some CPUs don't like that. See 1726 * in the huge TLB. Some CPUs don't like that. See
1729 * http://support.amd.com/us/Processor_TechDocs/41322.pdf, 1728 * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
1729 * Erratum 383 on page 93. Intel should be safe but 1728 * Erratum 383 on page 93. Intel should be safe but
1731 * also warns that it's only safe if the permission 1730 * also warns that it's only safe if the permission
1732 * and cache attributes of the two entries loaded in 1731 * and cache attributes of the two entries loaded in
1732 * the two TLBs are identical (which should be the case 1731 * the two TLBs are identical (which should be the case
1734 * here). But it is generally safer to never allow 1733 * here). But it is generally safer to never allow
1735 * small and huge TLB entries for the same virtual 1734 * small and huge TLB entries for the same virtual
1736 * address to be loaded simultaneously. So instead of 1735 * address to be loaded simultaneously. So instead of
1737 * doing "pmd_populate(); flush_tlb_range();" we first 1736 * doing "pmd_populate(); flush_tlb_range();" we first
1738 * mark the current pmd notpresent (atomically because 1737 * mark the current pmd notpresent (atomically because
1739 * here the pmd_trans_huge and pmd_trans_splitting 1738 * here the pmd_trans_huge and pmd_trans_splitting
1740 * must remain set at all times on the pmd until the 1739 * must remain set at all times on the pmd until the
1741 * split is complete for this pmd), then we flush the 1740 * split is complete for this pmd), then we flush the
1742 * SMP TLB and finally we write the non-huge version 1741 * SMP TLB and finally we write the non-huge version
1743 * of the pmd entry with pmd_populate. 1742 * of the pmd entry with pmd_populate.
1744 */ 1743 */
1745 pmdp_invalidate(vma, address, pmd); 1744 pmdp_invalidate(vma, address, pmd);
1746 pmd_populate(mm, pmd, pgtable); 1745 pmd_populate(mm, pmd, pgtable);
1747 ret = 1; 1746 ret = 1;
1748 } 1747 }
1749 spin_unlock(&mm->page_table_lock); 1748 spin_unlock(&mm->page_table_lock);
1750 1749
1751 return ret; 1750 return ret;
1752 } 1751 }
1753 1752
1754 /* must be called with anon_vma->root->rwsem held */ 1753 /* must be called with anon_vma->root->rwsem held */
1755 static void __split_huge_page(struct page *page, 1754 static void __split_huge_page(struct page *page,
1756 struct anon_vma *anon_vma) 1755 struct anon_vma *anon_vma)
1757 { 1756 {
1758 int mapcount, mapcount2; 1757 int mapcount, mapcount2;
1759 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1758 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1760 struct anon_vma_chain *avc; 1759 struct anon_vma_chain *avc;
1761 1760
1762 BUG_ON(!PageHead(page)); 1761 BUG_ON(!PageHead(page));
1763 BUG_ON(PageTail(page)); 1762 BUG_ON(PageTail(page));
1764 1763
1765 mapcount = 0; 1764 mapcount = 0;
1766 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { 1765 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1767 struct vm_area_struct *vma = avc->vma; 1766 struct vm_area_struct *vma = avc->vma;
1768 unsigned long addr = vma_address(page, vma); 1767 unsigned long addr = vma_address(page, vma);
1769 BUG_ON(is_vma_temporary_stack(vma)); 1768 BUG_ON(is_vma_temporary_stack(vma));
1770 mapcount += __split_huge_page_splitting(page, vma, addr); 1769 mapcount += __split_huge_page_splitting(page, vma, addr);
1771 } 1770 }
1772 /* 1771 /*
1773 * It is critical that new vmas are added to the tail of the 1772 * It is critical that new vmas are added to the tail of the
1774 * anon_vma list. This guarantees that if copy_huge_pmd() runs 1773 * anon_vma list. This guarantees that if copy_huge_pmd() runs
1775 * and establishes a child pmd before 1774 * and establishes a child pmd before
1776 * __split_huge_page_splitting() freezes the parent pmd (so if 1775 * __split_huge_page_splitting() freezes the parent pmd (so if
1777 * we fail to prevent copy_huge_pmd() from running until the 1776 * we fail to prevent copy_huge_pmd() from running until the
1778 * whole __split_huge_page() is complete), we will still see 1777 * whole __split_huge_page() is complete), we will still see
1779 * the newly established pmd of the child later during the 1778 * the newly established pmd of the child later during the
1780 * walk, to be able to set it as pmd_trans_splitting too. 1779 * walk, to be able to set it as pmd_trans_splitting too.
1781 */ 1780 */
1782 if (mapcount != page_mapcount(page)) 1781 if (mapcount != page_mapcount(page))
1783 printk(KERN_ERR "mapcount %d page_mapcount %d\n", 1782 printk(KERN_ERR "mapcount %d page_mapcount %d\n",
1784 mapcount, page_mapcount(page)); 1783 mapcount, page_mapcount(page));
1785 BUG_ON(mapcount != page_mapcount(page)); 1784 BUG_ON(mapcount != page_mapcount(page));
1786 1785
1787 __split_huge_page_refcount(page); 1786 __split_huge_page_refcount(page);
1788 1787
1789 mapcount2 = 0; 1788 mapcount2 = 0;
1790 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { 1789 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1791 struct vm_area_struct *vma = avc->vma; 1790 struct vm_area_struct *vma = avc->vma;
1792 unsigned long addr = vma_address(page, vma); 1791 unsigned long addr = vma_address(page, vma);
1793 BUG_ON(is_vma_temporary_stack(vma)); 1792 BUG_ON(is_vma_temporary_stack(vma));
1794 mapcount2 += __split_huge_page_map(page, vma, addr); 1793 mapcount2 += __split_huge_page_map(page, vma, addr);
1795 } 1794 }
1796 if (mapcount != mapcount2) 1795 if (mapcount != mapcount2)
1797 printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", 1796 printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n",
1798 mapcount, mapcount2, page_mapcount(page)); 1797 mapcount, mapcount2, page_mapcount(page));
1799 BUG_ON(mapcount != mapcount2); 1798 BUG_ON(mapcount != mapcount2);
1800 } 1799 }
1801 1800
1802 int split_huge_page(struct page *page) 1801 int split_huge_page(struct page *page)
1803 { 1802 {
1804 struct anon_vma *anon_vma; 1803 struct anon_vma *anon_vma;
1805 int ret = 1; 1804 int ret = 1;
1806 1805
1807 BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); 1806 BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
1808 BUG_ON(!PageAnon(page)); 1807 BUG_ON(!PageAnon(page));
1809 1808
1810 /* 1809 /*
1811 * The caller does not necessarily hold an mmap_sem that would prevent 1810 * The caller does not necessarily hold an mmap_sem that would prevent
1812 * the anon_vma from disappearing, so we first take a reference to it 1811 * the anon_vma from disappearing, so we first take a reference to it
1813 * and then lock the anon_vma for write. This is similar to 1812 * and then lock the anon_vma for write. This is similar to
1814 * page_lock_anon_vma_read except the write lock is taken to serialise 1813 * page_lock_anon_vma_read except the write lock is taken to serialise
1815 * against parallel split or collapse operations. 1814 * against parallel split or collapse operations.
1816 */ 1815 */
1817 anon_vma = page_get_anon_vma(page); 1816 anon_vma = page_get_anon_vma(page);
1818 if (!anon_vma) 1817 if (!anon_vma)
1819 goto out; 1818 goto out;
1820 anon_vma_lock_write(anon_vma); 1819 anon_vma_lock_write(anon_vma);
1821 1820
1822 ret = 0; 1821 ret = 0;
1823 if (!PageCompound(page)) 1822 if (!PageCompound(page))
1824 goto out_unlock; 1823 goto out_unlock;
1825 1824
1826 BUG_ON(!PageSwapBacked(page)); 1825 BUG_ON(!PageSwapBacked(page));
1827 __split_huge_page(page, anon_vma); 1826 __split_huge_page(page, anon_vma);
1828 count_vm_event(THP_SPLIT); 1827 count_vm_event(THP_SPLIT);
1829 1828
1830 BUG_ON(PageCompound(page)); 1829 BUG_ON(PageCompound(page));
1831 out_unlock: 1830 out_unlock:
1832 anon_vma_unlock_write(anon_vma); 1831 anon_vma_unlock_write(anon_vma);
1833 put_anon_vma(anon_vma); 1832 put_anon_vma(anon_vma);
1834 out: 1833 out:
1835 return ret; 1834 return ret;
1836 } 1835 }
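
split_huge_page() is normally reached indirectly, when an operation can no longer be described by a single huge pmd; each successful split bumps the THP_SPLIT counter seen above. One common way to provoke it from userspace is changing protection on a sub-range of a THP-backed mapping. A hedged sketch (whether a huge page is actually faulted in, and therefore split, depends on alignment and on the system's THP configuration):

#define _GNU_SOURCE
#include <sys/mman.h>

int main(void)
{
	size_t huge = 2UL << 20;		/* one PMD's worth on x86_64 */
	char *p = mmap(NULL, huge, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	madvise(p, huge, MADV_HUGEPAGE);	/* ask for THP on this vma */
	p[0] = 1;				/* may fault in a huge page */

	/* A 4 KiB protection change cannot be expressed with a huge pmd,
	 * so the kernel has to split the huge page first. */
	mprotect(p, 4096, PROT_READ);

	munmap(p, huge);
	return 0;
}
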
1837 1836
1838 #define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE) 1837 #define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
1839 1838
1840 int hugepage_madvise(struct vm_area_struct *vma, 1839 int hugepage_madvise(struct vm_area_struct *vma,
1841 unsigned long *vm_flags, int advice) 1840 unsigned long *vm_flags, int advice)
1842 { 1841 {
1843 struct mm_struct *mm = vma->vm_mm; 1842 struct mm_struct *mm = vma->vm_mm;
1844 1843
1845 switch (advice) { 1844 switch (advice) {
1846 case MADV_HUGEPAGE: 1845 case MADV_HUGEPAGE:
1847 /* 1846 /*
1848 * Be somewhat over-protective like KSM for now! 1847 * Be somewhat over-protective like KSM for now!
1849 */ 1848 */
1850 if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) 1849 if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
1851 return -EINVAL; 1850 return -EINVAL;
1852 if (mm->def_flags & VM_NOHUGEPAGE) 1851 if (mm->def_flags & VM_NOHUGEPAGE)
1853 return -EINVAL; 1852 return -EINVAL;
1854 *vm_flags &= ~VM_NOHUGEPAGE; 1853 *vm_flags &= ~VM_NOHUGEPAGE;
1855 *vm_flags |= VM_HUGEPAGE; 1854 *vm_flags |= VM_HUGEPAGE;
1856 /* 1855 /*
1857 * If the vma becomes good for khugepaged to scan, 1856 * If the vma becomes good for khugepaged to scan,
1858 * register it here without waiting for a page fault that 1857 * register it here without waiting for a page fault that
1859 * may not happen any time soon. 1858 * may not happen any time soon.
1860 */ 1859 */
1861 if (unlikely(khugepaged_enter_vma_merge(vma))) 1860 if (unlikely(khugepaged_enter_vma_merge(vma)))
1862 return -ENOMEM; 1861 return -ENOMEM;
1863 break; 1862 break;
1864 case MADV_NOHUGEPAGE: 1863 case MADV_NOHUGEPAGE:
1865 /* 1864 /*
1866 * Be somewhat over-protective like KSM for now! 1865 * Be somewhat over-protective like KSM for now!
1867 */ 1866 */
1868 if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP)) 1867 if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP))
1869 return -EINVAL; 1868 return -EINVAL;
1870 *vm_flags &= ~VM_HUGEPAGE; 1869 *vm_flags &= ~VM_HUGEPAGE;
1871 *vm_flags |= VM_NOHUGEPAGE; 1870 *vm_flags |= VM_NOHUGEPAGE;
1872 /* 1871 /*
1873 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning 1872 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
1874 * this vma even if we leave the mm registered in khugepaged if 1873 * this vma even if we leave the mm registered in khugepaged if
1875 * it got registered before VM_NOHUGEPAGE was set. 1874 * it got registered before VM_NOHUGEPAGE was set.
1876 */ 1875 */
1877 break; 1876 break;
1878 } 1877 }
1879 1878
1880 return 0; 1879 return 0;
1881 } 1880 }
1882 1881
1883 static int __init khugepaged_slab_init(void) 1882 static int __init khugepaged_slab_init(void)
1884 { 1883 {
1885 mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", 1884 mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
1886 sizeof(struct mm_slot), 1885 sizeof(struct mm_slot),
1887 __alignof__(struct mm_slot), 0, NULL); 1886 __alignof__(struct mm_slot), 0, NULL);
1888 if (!mm_slot_cache) 1887 if (!mm_slot_cache)
1889 return -ENOMEM; 1888 return -ENOMEM;
1890 1889
1891 return 0; 1890 return 0;
1892 } 1891 }
1893 1892
1894 static inline struct mm_slot *alloc_mm_slot(void) 1893 static inline struct mm_slot *alloc_mm_slot(void)
1895 { 1894 {
1896 if (!mm_slot_cache) /* initialization failed */ 1895 if (!mm_slot_cache) /* initialization failed */
1897 return NULL; 1896 return NULL;
1898 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); 1897 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
1899 } 1898 }
1900 1899
1901 static inline void free_mm_slot(struct mm_slot *mm_slot) 1900 static inline void free_mm_slot(struct mm_slot *mm_slot)
1902 { 1901 {
1903 kmem_cache_free(mm_slot_cache, mm_slot); 1902 kmem_cache_free(mm_slot_cache, mm_slot);
1904 } 1903 }
1905 1904
1906 static struct mm_slot *get_mm_slot(struct mm_struct *mm) 1905 static struct mm_slot *get_mm_slot(struct mm_struct *mm)
1907 { 1906 {
1908 struct mm_slot *mm_slot; 1907 struct mm_slot *mm_slot;
1909 1908
1910 hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm) 1909 hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
1911 if (mm == mm_slot->mm) 1910 if (mm == mm_slot->mm)
1912 return mm_slot; 1911 return mm_slot;
1913 1912
1914 return NULL; 1913 return NULL;
1915 } 1914 }
1916 1915
1917 static void insert_to_mm_slots_hash(struct mm_struct *mm, 1916 static void insert_to_mm_slots_hash(struct mm_struct *mm,
1918 struct mm_slot *mm_slot) 1917 struct mm_slot *mm_slot)
1919 { 1918 {
1920 mm_slot->mm = mm; 1919 mm_slot->mm = mm;
1921 hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); 1920 hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
1922 } 1921 }
1923 1922
1924 static inline int khugepaged_test_exit(struct mm_struct *mm) 1923 static inline int khugepaged_test_exit(struct mm_struct *mm)
1925 { 1924 {
1926 return atomic_read(&mm->mm_users) == 0; 1925 return atomic_read(&mm->mm_users) == 0;
1927 } 1926 }
1928 1927
1929 int __khugepaged_enter(struct mm_struct *mm) 1928 int __khugepaged_enter(struct mm_struct *mm)
1930 { 1929 {
1931 struct mm_slot *mm_slot; 1930 struct mm_slot *mm_slot;
1932 int wakeup; 1931 int wakeup;
1933 1932
1934 mm_slot = alloc_mm_slot(); 1933 mm_slot = alloc_mm_slot();
1935 if (!mm_slot) 1934 if (!mm_slot)
1936 return -ENOMEM; 1935 return -ENOMEM;
1937 1936
1938 /* __khugepaged_exit() must not run from under us */ 1937 /* __khugepaged_exit() must not run from under us */
1939 VM_BUG_ON(khugepaged_test_exit(mm)); 1938 VM_BUG_ON(khugepaged_test_exit(mm));
1940 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { 1939 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
1941 free_mm_slot(mm_slot); 1940 free_mm_slot(mm_slot);
1942 return 0; 1941 return 0;
1943 } 1942 }
1944 1943
1945 spin_lock(&khugepaged_mm_lock); 1944 spin_lock(&khugepaged_mm_lock);
1946 insert_to_mm_slots_hash(mm, mm_slot); 1945 insert_to_mm_slots_hash(mm, mm_slot);
1947 /* 1946 /*
1948 * Insert just behind the scanning cursor, to let the area settle 1947 * Insert just behind the scanning cursor, to let the area settle
1949 * down a little. 1948 * down a little.
1950 */ 1949 */
1951 wakeup = list_empty(&khugepaged_scan.mm_head); 1950 wakeup = list_empty(&khugepaged_scan.mm_head);
1952 list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); 1951 list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
1953 spin_unlock(&khugepaged_mm_lock); 1952 spin_unlock(&khugepaged_mm_lock);
1954 1953
1955 atomic_inc(&mm->mm_count); 1954 atomic_inc(&mm->mm_count);
1956 if (wakeup) 1955 if (wakeup)
1957 wake_up_interruptible(&khugepaged_wait); 1956 wake_up_interruptible(&khugepaged_wait);
1958 1957
1959 return 0; 1958 return 0;
1960 } 1959 }
1961 1960
1962 int khugepaged_enter_vma_merge(struct vm_area_struct *vma) 1961 int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
1963 { 1962 {
1964 unsigned long hstart, hend; 1963 unsigned long hstart, hend;
1965 if (!vma->anon_vma) 1964 if (!vma->anon_vma)
1966 /* 1965 /*
1967 * Not yet faulted in so we will register later in the 1966 * Not yet faulted in so we will register later in the
1968 * page fault if needed. 1967 * page fault if needed.
1969 */ 1968 */
1970 return 0; 1969 return 0;
1971 if (vma->vm_ops) 1970 if (vma->vm_ops)
1972 /* khugepaged not yet working on file or special mappings */ 1971 /* khugepaged not yet working on file or special mappings */
1973 return 0; 1972 return 0;
1974 VM_BUG_ON(vma->vm_flags & VM_NO_THP); 1973 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
1975 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 1974 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1976 hend = vma->vm_end & HPAGE_PMD_MASK; 1975 hend = vma->vm_end & HPAGE_PMD_MASK;
1977 if (hstart < hend) 1976 if (hstart < hend)
1978 return khugepaged_enter(vma); 1977 return khugepaged_enter(vma);
1979 return 0; 1978 return 0;
1980 } 1979 }
1981 1980
1982 void __khugepaged_exit(struct mm_struct *mm) 1981 void __khugepaged_exit(struct mm_struct *mm)
1983 { 1982 {
1984 struct mm_slot *mm_slot; 1983 struct mm_slot *mm_slot;
1985 int free = 0; 1984 int free = 0;
1986 1985
1987 spin_lock(&khugepaged_mm_lock); 1986 spin_lock(&khugepaged_mm_lock);
1988 mm_slot = get_mm_slot(mm); 1987 mm_slot = get_mm_slot(mm);
1989 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { 1988 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
1990 hash_del(&mm_slot->hash); 1989 hash_del(&mm_slot->hash);
1991 list_del(&mm_slot->mm_node); 1990 list_del(&mm_slot->mm_node);
1992 free = 1; 1991 free = 1;
1993 } 1992 }
1994 spin_unlock(&khugepaged_mm_lock); 1993 spin_unlock(&khugepaged_mm_lock);
1995 1994
1996 if (free) { 1995 if (free) {
1997 clear_bit(MMF_VM_HUGEPAGE, &mm->flags); 1996 clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1998 free_mm_slot(mm_slot); 1997 free_mm_slot(mm_slot);
1999 mmdrop(mm); 1998 mmdrop(mm);
2000 } else if (mm_slot) { 1999 } else if (mm_slot) {
2001 /* 2000 /*
2002 * This is required to serialize against 2001 * This is required to serialize against
2003 * khugepaged_test_exit() (which is guaranteed to run 2002 * khugepaged_test_exit() (which is guaranteed to run
2004 * under mmap_sem read mode). Stop here (after we 2003 * under mmap_sem read mode). Stop here (after we
2005 * return, all pagetables will be destroyed) until 2004 * return, all pagetables will be destroyed) until
2006 * khugepaged has finished working on the pagetables 2005 * khugepaged has finished working on the pagetables
2007 * under the mmap_sem. 2006 * under the mmap_sem.
2008 */ 2007 */
2009 down_write(&mm->mmap_sem); 2008 down_write(&mm->mmap_sem);
2010 up_write(&mm->mmap_sem); 2009 up_write(&mm->mmap_sem);
2011 } 2010 }
2012 } 2011 }
2013 2012
2014 static void release_pte_page(struct page *page) 2013 static void release_pte_page(struct page *page)
2015 { 2014 {
2016 /* 0 stands for page_is_file_cache(page) == false */ 2015 /* 0 stands for page_is_file_cache(page) == false */
2017 dec_zone_page_state(page, NR_ISOLATED_ANON + 0); 2016 dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
2018 unlock_page(page); 2017 unlock_page(page);
2019 putback_lru_page(page); 2018 putback_lru_page(page);
2020 } 2019 }
2021 2020
2022 static void release_pte_pages(pte_t *pte, pte_t *_pte) 2021 static void release_pte_pages(pte_t *pte, pte_t *_pte)
2023 { 2022 {
2024 while (--_pte >= pte) { 2023 while (--_pte >= pte) {
2025 pte_t pteval = *_pte; 2024 pte_t pteval = *_pte;
2026 if (!pte_none(pteval)) 2025 if (!pte_none(pteval))
2027 release_pte_page(pte_page(pteval)); 2026 release_pte_page(pte_page(pteval));
2028 } 2027 }
2029 } 2028 }
2030 2029
2031 static int __collapse_huge_page_isolate(struct vm_area_struct *vma, 2030 static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2032 unsigned long address, 2031 unsigned long address,
2033 pte_t *pte) 2032 pte_t *pte)
2034 { 2033 {
2035 struct page *page; 2034 struct page *page;
2036 pte_t *_pte; 2035 pte_t *_pte;
2037 int referenced = 0, none = 0; 2036 int referenced = 0, none = 0;
2038 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; 2037 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
2039 _pte++, address += PAGE_SIZE) { 2038 _pte++, address += PAGE_SIZE) {
2040 pte_t pteval = *_pte; 2039 pte_t pteval = *_pte;
2041 if (pte_none(pteval)) { 2040 if (pte_none(pteval)) {
2042 if (++none <= khugepaged_max_ptes_none) 2041 if (++none <= khugepaged_max_ptes_none)
2043 continue; 2042 continue;
2044 else 2043 else
2045 goto out; 2044 goto out;
2046 } 2045 }
2047 if (!pte_present(pteval) || !pte_write(pteval)) 2046 if (!pte_present(pteval) || !pte_write(pteval))
2048 goto out; 2047 goto out;
2049 page = vm_normal_page(vma, address, pteval); 2048 page = vm_normal_page(vma, address, pteval);
2050 if (unlikely(!page)) 2049 if (unlikely(!page))
2051 goto out; 2050 goto out;
2052 2051
2053 VM_BUG_ON(PageCompound(page)); 2052 VM_BUG_ON(PageCompound(page));
2054 BUG_ON(!PageAnon(page)); 2053 BUG_ON(!PageAnon(page));
2055 VM_BUG_ON(!PageSwapBacked(page)); 2054 VM_BUG_ON(!PageSwapBacked(page));
2056 2055
2057 /* cannot use mapcount: can't collapse if there's a gup pin */ 2056 /* cannot use mapcount: can't collapse if there's a gup pin */
2058 if (page_count(page) != 1) 2057 if (page_count(page) != 1)
2059 goto out; 2058 goto out;
2060 /* 2059 /*
2061 * We can do it before isolate_lru_page because the 2060 * We can do it before isolate_lru_page because the
2062 * page can't be freed from under us. NOTE: PG_lock 2061 * page can't be freed from under us. NOTE: PG_lock
2063 * is needed to serialize against split_huge_page 2062 * is needed to serialize against split_huge_page
2064 * when invoked from the VM. 2063 * when invoked from the VM.
2065 */ 2064 */
2066 if (!trylock_page(page)) 2065 if (!trylock_page(page))
2067 goto out; 2066 goto out;
2068 /* 2067 /*
2069 * Isolate the page to avoid collapsing a hugepage 2068 * Isolate the page to avoid collapsing a hugepage
2070 * currently in use by the VM. 2069 * currently in use by the VM.
2071 */ 2070 */
2072 if (isolate_lru_page(page)) { 2071 if (isolate_lru_page(page)) {
2073 unlock_page(page); 2072 unlock_page(page);
2074 goto out; 2073 goto out;
2075 } 2074 }
2076 /* 0 stands for page_is_file_cache(page) == false */ 2075 /* 0 stands for page_is_file_cache(page) == false */
2077 inc_zone_page_state(page, NR_ISOLATED_ANON + 0); 2076 inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
2078 VM_BUG_ON(!PageLocked(page)); 2077 VM_BUG_ON(!PageLocked(page));
2079 VM_BUG_ON(PageLRU(page)); 2078 VM_BUG_ON(PageLRU(page));
2080 2079
2081 /* If no mapped pte is young, don't collapse the page */ 2080 /* If no mapped pte is young, don't collapse the page */
2082 if (pte_young(pteval) || PageReferenced(page) || 2081 if (pte_young(pteval) || PageReferenced(page) ||
2083 mmu_notifier_test_young(vma->vm_mm, address)) 2082 mmu_notifier_test_young(vma->vm_mm, address))
2084 referenced = 1; 2083 referenced = 1;
2085 } 2084 }
2086 if (likely(referenced)) 2085 if (likely(referenced))
2087 return 1; 2086 return 1;
2088 out: 2087 out:
2089 release_pte_pages(pte, _pte); 2088 release_pte_pages(pte, _pte);
2090 return 0; 2089 return 0;
2091 } 2090 }
2092 2091
2093 static void __collapse_huge_page_copy(pte_t *pte, struct page *page, 2092 static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
2094 struct vm_area_struct *vma, 2093 struct vm_area_struct *vma,
2095 unsigned long address, 2094 unsigned long address,
2096 spinlock_t *ptl) 2095 spinlock_t *ptl)
2097 { 2096 {
2098 pte_t *_pte; 2097 pte_t *_pte;
2099 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { 2098 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
2100 pte_t pteval = *_pte; 2099 pte_t pteval = *_pte;
2101 struct page *src_page; 2100 struct page *src_page;
2102 2101
2103 if (pte_none(pteval)) { 2102 if (pte_none(pteval)) {
2104 clear_user_highpage(page, address); 2103 clear_user_highpage(page, address);
2105 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); 2104 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
2106 } else { 2105 } else {
2107 src_page = pte_page(pteval); 2106 src_page = pte_page(pteval);
2108 copy_user_highpage(page, src_page, address, vma); 2107 copy_user_highpage(page, src_page, address, vma);
2109 VM_BUG_ON(page_mapcount(src_page) != 1); 2108 VM_BUG_ON(page_mapcount(src_page) != 1);
2110 release_pte_page(src_page); 2109 release_pte_page(src_page);
2111 /* 2110 /*
2112 * ptl mostly unnecessary, but preempt has to 2111 * ptl mostly unnecessary, but preempt has to
2113 * be disabled to update the per-cpu stats 2112 * be disabled to update the per-cpu stats
2114 * inside page_remove_rmap(). 2113 * inside page_remove_rmap().
2115 */ 2114 */
2116 spin_lock(ptl); 2115 spin_lock(ptl);
2117 /* 2116 /*
2118 * paravirt calls inside pte_clear here are 2117 * paravirt calls inside pte_clear here are
2119 * superfluous. 2118 * superfluous.
2120 */ 2119 */
2121 pte_clear(vma->vm_mm, address, _pte); 2120 pte_clear(vma->vm_mm, address, _pte);
2122 page_remove_rmap(src_page); 2121 page_remove_rmap(src_page);
2123 spin_unlock(ptl); 2122 spin_unlock(ptl);
2124 free_page_and_swap_cache(src_page); 2123 free_page_and_swap_cache(src_page);
2125 } 2124 }
2126 2125
2127 address += PAGE_SIZE; 2126 address += PAGE_SIZE;
2128 page++; 2127 page++;
2129 } 2128 }
2130 } 2129 }
2131 2130
2132 static void khugepaged_alloc_sleep(void) 2131 static void khugepaged_alloc_sleep(void)
2133 { 2132 {
2134 wait_event_freezable_timeout(khugepaged_wait, false, 2133 wait_event_freezable_timeout(khugepaged_wait, false,
2135 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); 2134 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
2136 } 2135 }
2137 2136
2138 #ifdef CONFIG_NUMA 2137 #ifdef CONFIG_NUMA
2139 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) 2138 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
2140 { 2139 {
2141 if (IS_ERR(*hpage)) { 2140 if (IS_ERR(*hpage)) {
2142 if (!*wait) 2141 if (!*wait)
2143 return false; 2142 return false;
2144 2143
2145 *wait = false; 2144 *wait = false;
2146 *hpage = NULL; 2145 *hpage = NULL;
2147 khugepaged_alloc_sleep(); 2146 khugepaged_alloc_sleep();
2148 } else if (*hpage) { 2147 } else if (*hpage) {
2149 put_page(*hpage); 2148 put_page(*hpage);
2150 *hpage = NULL; 2149 *hpage = NULL;
2151 } 2150 }
2152 2151
2153 return true; 2152 return true;
2154 } 2153 }
2155 2154
2156 static struct page 2155 static struct page
2157 *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, 2156 *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
2158 struct vm_area_struct *vma, unsigned long address, 2157 struct vm_area_struct *vma, unsigned long address,
2159 int node) 2158 int node)
2160 { 2159 {
2161 VM_BUG_ON(*hpage); 2160 VM_BUG_ON(*hpage);
2162 /* 2161 /*
2163 * Allocate the page while the vma is still valid and under 2162 * Allocate the page while the vma is still valid and under
2164 * the mmap_sem read mode so there is no memory allocation 2163 * the mmap_sem read mode so there is no memory allocation
2165 * later when we take the mmap_sem in write mode. This is more 2164 * later when we take the mmap_sem in write mode. This is more
2166 * friendly behavior (OTOH it may actually hide bugs) to 2165 * friendly behavior (OTOH it may actually hide bugs) to
2167 * filesystems in userland with daemons allocating memory in 2166 * filesystems in userland with daemons allocating memory in
2168 * the userland I/O paths. Allocating memory with the 2167 * the userland I/O paths. Allocating memory with the
2169 * mmap_sem in read mode is also a good idea to allow greater 2168 * mmap_sem in read mode is also a good idea to allow greater
2170 * scalability. 2169 * scalability.
2171 */ 2170 */
2172 *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address, 2171 *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
2173 node, __GFP_OTHER_NODE); 2172 node, __GFP_OTHER_NODE);
2174 2173
2175 /* 2174 /*
2176 * After allocating the hugepage, release the mmap_sem read lock in 2175 * After allocating the hugepage, release the mmap_sem read lock in
2177 * preparation for taking it in write mode. 2176 * preparation for taking it in write mode.
2178 */ 2177 */
2179 up_read(&mm->mmap_sem); 2178 up_read(&mm->mmap_sem);
2180 if (unlikely(!*hpage)) { 2179 if (unlikely(!*hpage)) {
2181 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 2180 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2182 *hpage = ERR_PTR(-ENOMEM); 2181 *hpage = ERR_PTR(-ENOMEM);
2183 return NULL; 2182 return NULL;
2184 } 2183 }
2185 2184
2186 count_vm_event(THP_COLLAPSE_ALLOC); 2185 count_vm_event(THP_COLLAPSE_ALLOC);
2187 return *hpage; 2186 return *hpage;
2188 } 2187 }
2189 #else 2188 #else
2190 static struct page *khugepaged_alloc_hugepage(bool *wait) 2189 static struct page *khugepaged_alloc_hugepage(bool *wait)
2191 { 2190 {
2192 struct page *hpage; 2191 struct page *hpage;
2193 2192
2194 do { 2193 do {
2195 hpage = alloc_hugepage(khugepaged_defrag()); 2194 hpage = alloc_hugepage(khugepaged_defrag());
2196 if (!hpage) { 2195 if (!hpage) {
2197 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 2196 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2198 if (!*wait) 2197 if (!*wait)
2199 return NULL; 2198 return NULL;
2200 2199
2201 *wait = false; 2200 *wait = false;
2202 khugepaged_alloc_sleep(); 2201 khugepaged_alloc_sleep();
2203 } else 2202 } else
2204 count_vm_event(THP_COLLAPSE_ALLOC); 2203 count_vm_event(THP_COLLAPSE_ALLOC);
2205 } while (unlikely(!hpage) && likely(khugepaged_enabled())); 2204 } while (unlikely(!hpage) && likely(khugepaged_enabled()));
2206 2205
2207 return hpage; 2206 return hpage;
2208 } 2207 }
2209 2208
2210 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) 2209 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
2211 { 2210 {
2212 if (!*hpage) 2211 if (!*hpage)
2213 *hpage = khugepaged_alloc_hugepage(wait); 2212 *hpage = khugepaged_alloc_hugepage(wait);
2214 2213
2215 if (unlikely(!*hpage)) 2214 if (unlikely(!*hpage))
2216 return false; 2215 return false;
2217 2216
2218 return true; 2217 return true;
2219 } 2218 }
2220 2219
2221 static struct page 2220 static struct page
2222 *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, 2221 *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
2223 struct vm_area_struct *vma, unsigned long address, 2222 struct vm_area_struct *vma, unsigned long address,
2224 int node) 2223 int node)
2225 { 2224 {
2226 up_read(&mm->mmap_sem); 2225 up_read(&mm->mmap_sem);
2227 VM_BUG_ON(!*hpage); 2226 VM_BUG_ON(!*hpage);
2228 return *hpage; 2227 return *hpage;
2229 } 2228 }
2230 #endif 2229 #endif
2231 2230
2232 static bool hugepage_vma_check(struct vm_area_struct *vma) 2231 static bool hugepage_vma_check(struct vm_area_struct *vma)
2233 { 2232 {
2234 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || 2233 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
2235 (vma->vm_flags & VM_NOHUGEPAGE)) 2234 (vma->vm_flags & VM_NOHUGEPAGE))
2236 return false; 2235 return false;
2237 2236
2238 if (!vma->anon_vma || vma->vm_ops) 2237 if (!vma->anon_vma || vma->vm_ops)
2239 return false; 2238 return false;
2240 if (is_vma_temporary_stack(vma)) 2239 if (is_vma_temporary_stack(vma))
2241 return false; 2240 return false;
2242 VM_BUG_ON(vma->vm_flags & VM_NO_THP); 2241 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
2243 return true; 2242 return true;
2244 } 2243 }
2245 2244
2246 static void collapse_huge_page(struct mm_struct *mm, 2245 static void collapse_huge_page(struct mm_struct *mm,
2247 unsigned long address, 2246 unsigned long address,
2248 struct page **hpage, 2247 struct page **hpage,
2249 struct vm_area_struct *vma, 2248 struct vm_area_struct *vma,
2250 int node) 2249 int node)
2251 { 2250 {
2252 pmd_t *pmd, _pmd; 2251 pmd_t *pmd, _pmd;
2253 pte_t *pte; 2252 pte_t *pte;
2254 pgtable_t pgtable; 2253 pgtable_t pgtable;
2255 struct page *new_page; 2254 struct page *new_page;
2256 spinlock_t *ptl; 2255 spinlock_t *ptl;
2257 int isolated; 2256 int isolated;
2258 unsigned long hstart, hend; 2257 unsigned long hstart, hend;
2259 unsigned long mmun_start; /* For mmu_notifiers */ 2258 unsigned long mmun_start; /* For mmu_notifiers */
2260 unsigned long mmun_end; /* For mmu_notifiers */ 2259 unsigned long mmun_end; /* For mmu_notifiers */
2261 2260
2262 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2261 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2263 2262
2264 /* release the mmap_sem read lock. */ 2263 /* release the mmap_sem read lock. */
2265 new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); 2264 new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
2266 if (!new_page) 2265 if (!new_page)
2267 return; 2266 return;
2268 2267
2269 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) 2268 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
2270 return; 2269 return;
2271 2270
2272 /* 2271 /*
2273 * Prevent all access to pagetables with the exception of 2272 * Prevent all access to pagetables with the exception of
2274 * gup_fast later handled by the ptep_clear_flush and the VM 2273 * gup_fast later handled by the ptep_clear_flush and the VM
2275 * handled by the anon_vma lock + PG_lock. 2274 * handled by the anon_vma lock + PG_lock.
2276 */ 2275 */
2277 down_write(&mm->mmap_sem); 2276 down_write(&mm->mmap_sem);
2278 if (unlikely(khugepaged_test_exit(mm))) 2277 if (unlikely(khugepaged_test_exit(mm)))
2279 goto out; 2278 goto out;
2280 2279
2281 vma = find_vma(mm, address); 2280 vma = find_vma(mm, address);
2282 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2281 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2283 hend = vma->vm_end & HPAGE_PMD_MASK; 2282 hend = vma->vm_end & HPAGE_PMD_MASK;
2284 if (address < hstart || address + HPAGE_PMD_SIZE > hend) 2283 if (address < hstart || address + HPAGE_PMD_SIZE > hend)
2285 goto out; 2284 goto out;
2286 if (!hugepage_vma_check(vma)) 2285 if (!hugepage_vma_check(vma))
2287 goto out; 2286 goto out;
2288 pmd = mm_find_pmd(mm, address); 2287 pmd = mm_find_pmd(mm, address);
2289 if (!pmd) 2288 if (!pmd)
2290 goto out; 2289 goto out;
2291 if (pmd_trans_huge(*pmd)) 2290 if (pmd_trans_huge(*pmd))
2292 goto out; 2291 goto out;
2293 2292
2294 anon_vma_lock_write(vma->anon_vma); 2293 anon_vma_lock_write(vma->anon_vma);
2295 2294
2296 pte = pte_offset_map(pmd, address); 2295 pte = pte_offset_map(pmd, address);
2297 ptl = pte_lockptr(mm, pmd); 2296 ptl = pte_lockptr(mm, pmd);
2298 2297
2299 mmun_start = address; 2298 mmun_start = address;
2300 mmun_end = address + HPAGE_PMD_SIZE; 2299 mmun_end = address + HPAGE_PMD_SIZE;
2301 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2300 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2302 spin_lock(&mm->page_table_lock); /* probably unnecessary */ 2301 spin_lock(&mm->page_table_lock); /* probably unnecessary */
2303 /* 2302 /*
2304 * After this gup_fast can't run anymore. This also removes 2303 * After this gup_fast can't run anymore. This also removes
2305 * any huge TLB entry from the CPU so we won't allow 2304 * any huge TLB entry from the CPU so we won't allow
2306 * huge and small TLB entries for the same virtual address 2305 * huge and small TLB entries for the same virtual address
2307 * to avoid the risk of CPU bugs in that area. 2306 * to avoid the risk of CPU bugs in that area.
2308 */ 2307 */
2309 _pmd = pmdp_clear_flush(vma, address, pmd); 2308 _pmd = pmdp_clear_flush(vma, address, pmd);
2310 spin_unlock(&mm->page_table_lock); 2309 spin_unlock(&mm->page_table_lock);
2311 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2310 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2312 2311
2313 spin_lock(ptl); 2312 spin_lock(ptl);
2314 isolated = __collapse_huge_page_isolate(vma, address, pte); 2313 isolated = __collapse_huge_page_isolate(vma, address, pte);
2315 spin_unlock(ptl); 2314 spin_unlock(ptl);
2316 2315
2317 if (unlikely(!isolated)) { 2316 if (unlikely(!isolated)) {
2318 pte_unmap(pte); 2317 pte_unmap(pte);
2319 spin_lock(&mm->page_table_lock); 2318 spin_lock(&mm->page_table_lock);
2320 BUG_ON(!pmd_none(*pmd)); 2319 BUG_ON(!pmd_none(*pmd));
2321 set_pmd_at(mm, address, pmd, _pmd); 2320 set_pmd_at(mm, address, pmd, _pmd);
2322 spin_unlock(&mm->page_table_lock); 2321 spin_unlock(&mm->page_table_lock);
2323 anon_vma_unlock_write(vma->anon_vma); 2322 anon_vma_unlock_write(vma->anon_vma);
2324 goto out; 2323 goto out;
2325 } 2324 }
2326 2325
2327 /* 2326 /*
2328 * All pages are isolated and locked so anon_vma rmap 2327 * All pages are isolated and locked so anon_vma rmap
2329 * can't run anymore. 2328 * can't run anymore.
2330 */ 2329 */
2331 anon_vma_unlock_write(vma->anon_vma); 2330 anon_vma_unlock_write(vma->anon_vma);
2332 2331
2333 __collapse_huge_page_copy(pte, new_page, vma, address, ptl); 2332 __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
2334 pte_unmap(pte); 2333 pte_unmap(pte);
2335 __SetPageUptodate(new_page); 2334 __SetPageUptodate(new_page);
2336 pgtable = pmd_pgtable(_pmd); 2335 pgtable = pmd_pgtable(_pmd);
2337 2336
2338 _pmd = mk_huge_pmd(new_page, vma); 2337 _pmd = mk_huge_pmd(new_page, vma);
2339 2338
2340 /* 2339 /*
2341 * spin_lock() below is not the equivalent of smp_wmb(), so 2340 * spin_lock() below is not the equivalent of smp_wmb(), so
2342 * this is needed to prevent the copy_huge_page writes from 2341 * this is needed to prevent the copy_huge_page writes from
2343 * becoming visible after the set_pmd_at() write. 2342 * becoming visible after the set_pmd_at() write.
2344 */ 2343 */
2345 smp_wmb(); 2344 smp_wmb();
2346 2345
2347 spin_lock(&mm->page_table_lock); 2346 spin_lock(&mm->page_table_lock);
2348 BUG_ON(!pmd_none(*pmd)); 2347 BUG_ON(!pmd_none(*pmd));
2349 page_add_new_anon_rmap(new_page, vma, address); 2348 page_add_new_anon_rmap(new_page, vma, address);
2350 set_pmd_at(mm, address, pmd, _pmd); 2349 set_pmd_at(mm, address, pmd, _pmd);
2351 update_mmu_cache_pmd(vma, address, pmd); 2350 update_mmu_cache_pmd(vma, address, pmd);
2352 pgtable_trans_huge_deposit(mm, pgtable); 2351 pgtable_trans_huge_deposit(mm, pgtable);
2353 spin_unlock(&mm->page_table_lock); 2352 spin_unlock(&mm->page_table_lock);
2354 2353
2355 *hpage = NULL; 2354 *hpage = NULL;
2356 2355
2357 khugepaged_pages_collapsed++; 2356 khugepaged_pages_collapsed++;
2358 out_up_write: 2357 out_up_write:
2359 up_write(&mm->mmap_sem); 2358 up_write(&mm->mmap_sem);
2360 return; 2359 return;
2361 2360
2362 out: 2361 out:
2363 mem_cgroup_uncharge_page(new_page); 2362 mem_cgroup_uncharge_page(new_page);
2364 goto out_up_write; 2363 goto out_up_write;
2365 } 2364 }
2366 2365
2367 static int khugepaged_scan_pmd(struct mm_struct *mm, 2366 static int khugepaged_scan_pmd(struct mm_struct *mm,
2368 struct vm_area_struct *vma, 2367 struct vm_area_struct *vma,
2369 unsigned long address, 2368 unsigned long address,
2370 struct page **hpage) 2369 struct page **hpage)
2371 { 2370 {
2372 pmd_t *pmd; 2371 pmd_t *pmd;
2373 pte_t *pte, *_pte; 2372 pte_t *pte, *_pte;
2374 int ret = 0, referenced = 0, none = 0; 2373 int ret = 0, referenced = 0, none = 0;
2375 struct page *page; 2374 struct page *page;
2376 unsigned long _address; 2375 unsigned long _address;
2377 spinlock_t *ptl; 2376 spinlock_t *ptl;
2378 int node = NUMA_NO_NODE; 2377 int node = NUMA_NO_NODE;
2379 2378
2380 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2379 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2381 2380
2382 pmd = mm_find_pmd(mm, address); 2381 pmd = mm_find_pmd(mm, address);
2383 if (!pmd) 2382 if (!pmd)
2384 goto out; 2383 goto out;
2385 if (pmd_trans_huge(*pmd)) 2384 if (pmd_trans_huge(*pmd))
2386 goto out; 2385 goto out;
2387 2386
2388 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2387 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
2389 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; 2388 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
2390 _pte++, _address += PAGE_SIZE) { 2389 _pte++, _address += PAGE_SIZE) {
2391 pte_t pteval = *_pte; 2390 pte_t pteval = *_pte;
2392 if (pte_none(pteval)) { 2391 if (pte_none(pteval)) {
2393 if (++none <= khugepaged_max_ptes_none) 2392 if (++none <= khugepaged_max_ptes_none)
2394 continue; 2393 continue;
2395 else 2394 else
2396 goto out_unmap; 2395 goto out_unmap;
2397 } 2396 }
2398 if (!pte_present(pteval) || !pte_write(pteval)) 2397 if (!pte_present(pteval) || !pte_write(pteval))
2399 goto out_unmap; 2398 goto out_unmap;
2400 page = vm_normal_page(vma, _address, pteval); 2399 page = vm_normal_page(vma, _address, pteval);
2401 if (unlikely(!page)) 2400 if (unlikely(!page))
2402 goto out_unmap; 2401 goto out_unmap;
2403 /* 2402 /*
2404 * Choose the node of the first page. This could 2403 * Choose the node of the first page. This could
2405 * be more sophisticated and look at more pages, 2404 * be more sophisticated and look at more pages,
2406 * but isn't for now. 2405 * but isn't for now.
2407 */ 2406 */
2408 if (node == NUMA_NO_NODE) 2407 if (node == NUMA_NO_NODE)
2409 node = page_to_nid(page); 2408 node = page_to_nid(page);
2410 VM_BUG_ON(PageCompound(page)); 2409 VM_BUG_ON(PageCompound(page));
2411 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) 2410 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
2412 goto out_unmap; 2411 goto out_unmap;
2413 /* cannot use mapcount: can't collapse if there's a gup pin */ 2412 /* cannot use mapcount: can't collapse if there's a gup pin */
2414 if (page_count(page) != 1) 2413 if (page_count(page) != 1)
2415 goto out_unmap; 2414 goto out_unmap;
2416 if (pte_young(pteval) || PageReferenced(page) || 2415 if (pte_young(pteval) || PageReferenced(page) ||
2417 mmu_notifier_test_young(vma->vm_mm, address)) 2416 mmu_notifier_test_young(vma->vm_mm, address))
2418 referenced = 1; 2417 referenced = 1;
2419 } 2418 }
2420 if (referenced) 2419 if (referenced)
2421 ret = 1; 2420 ret = 1;
2422 out_unmap: 2421 out_unmap:
2423 pte_unmap_unlock(pte, ptl); 2422 pte_unmap_unlock(pte, ptl);
2424 if (ret) 2423 if (ret)
2425 /* collapse_huge_page will return with the mmap_sem released */ 2424 /* collapse_huge_page will return with the mmap_sem released */
2426 collapse_huge_page(mm, address, hpage, vma, node); 2425 collapse_huge_page(mm, address, hpage, vma, node);
2427 out: 2426 out:
2428 return ret; 2427 return ret;
2429 } 2428 }
2430 2429
2431 static void collect_mm_slot(struct mm_slot *mm_slot) 2430 static void collect_mm_slot(struct mm_slot *mm_slot)
2432 { 2431 {
2433 struct mm_struct *mm = mm_slot->mm; 2432 struct mm_struct *mm = mm_slot->mm;
2434 2433
2435 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); 2434 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
2436 2435
2437 if (khugepaged_test_exit(mm)) { 2436 if (khugepaged_test_exit(mm)) {
2438 /* free mm_slot */ 2437 /* free mm_slot */
2439 hash_del(&mm_slot->hash); 2438 hash_del(&mm_slot->hash);
2440 list_del(&mm_slot->mm_node); 2439 list_del(&mm_slot->mm_node);
2441 2440
2442 /* 2441 /*
2443 * Not strictly needed because the mm exited already. 2442 * Not strictly needed because the mm exited already.
2444 * 2443 *
2445 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); 2444 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
2446 */ 2445 */
2447 2446
2448 /* khugepaged_mm_lock actually not necessary for the below */ 2447 /* khugepaged_mm_lock actually not necessary for the below */
2449 free_mm_slot(mm_slot); 2448 free_mm_slot(mm_slot);
2450 mmdrop(mm); 2449 mmdrop(mm);
2451 } 2450 }
2452 } 2451 }
2453 2452
2454 static unsigned int khugepaged_scan_mm_slot(unsigned int pages, 2453 static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2455 struct page **hpage) 2454 struct page **hpage)
2456 __releases(&khugepaged_mm_lock) 2455 __releases(&khugepaged_mm_lock)
2457 __acquires(&khugepaged_mm_lock) 2456 __acquires(&khugepaged_mm_lock)
2458 { 2457 {
2459 struct mm_slot *mm_slot; 2458 struct mm_slot *mm_slot;
2460 struct mm_struct *mm; 2459 struct mm_struct *mm;
2461 struct vm_area_struct *vma; 2460 struct vm_area_struct *vma;
2462 int progress = 0; 2461 int progress = 0;
2463 2462
2464 VM_BUG_ON(!pages); 2463 VM_BUG_ON(!pages);
2465 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); 2464 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
2466 2465
2467 if (khugepaged_scan.mm_slot) 2466 if (khugepaged_scan.mm_slot)
2468 mm_slot = khugepaged_scan.mm_slot; 2467 mm_slot = khugepaged_scan.mm_slot;
2469 else { 2468 else {
2470 mm_slot = list_entry(khugepaged_scan.mm_head.next, 2469 mm_slot = list_entry(khugepaged_scan.mm_head.next,
2471 struct mm_slot, mm_node); 2470 struct mm_slot, mm_node);
2472 khugepaged_scan.address = 0; 2471 khugepaged_scan.address = 0;
2473 khugepaged_scan.mm_slot = mm_slot; 2472 khugepaged_scan.mm_slot = mm_slot;
2474 } 2473 }
2475 spin_unlock(&khugepaged_mm_lock); 2474 spin_unlock(&khugepaged_mm_lock);
2476 2475
2477 mm = mm_slot->mm; 2476 mm = mm_slot->mm;
2478 down_read(&mm->mmap_sem); 2477 down_read(&mm->mmap_sem);
2479 if (unlikely(khugepaged_test_exit(mm))) 2478 if (unlikely(khugepaged_test_exit(mm)))
2480 vma = NULL; 2479 vma = NULL;
2481 else 2480 else
2482 vma = find_vma(mm, khugepaged_scan.address); 2481 vma = find_vma(mm, khugepaged_scan.address);
2483 2482
2484 progress++; 2483 progress++;
2485 for (; vma; vma = vma->vm_next) { 2484 for (; vma; vma = vma->vm_next) {
2486 unsigned long hstart, hend; 2485 unsigned long hstart, hend;
2487 2486
2488 cond_resched(); 2487 cond_resched();
2489 if (unlikely(khugepaged_test_exit(mm))) { 2488 if (unlikely(khugepaged_test_exit(mm))) {
2490 progress++; 2489 progress++;
2491 break; 2490 break;
2492 } 2491 }
2493 if (!hugepage_vma_check(vma)) { 2492 if (!hugepage_vma_check(vma)) {
2494 skip: 2493 skip:
2495 progress++; 2494 progress++;
2496 continue; 2495 continue;
2497 } 2496 }
2498 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2497 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2499 hend = vma->vm_end & HPAGE_PMD_MASK; 2498 hend = vma->vm_end & HPAGE_PMD_MASK;
2500 if (hstart >= hend) 2499 if (hstart >= hend)
2501 goto skip; 2500 goto skip;
2502 if (khugepaged_scan.address > hend) 2501 if (khugepaged_scan.address > hend)
2503 goto skip; 2502 goto skip;
2504 if (khugepaged_scan.address < hstart) 2503 if (khugepaged_scan.address < hstart)
2505 khugepaged_scan.address = hstart; 2504 khugepaged_scan.address = hstart;
2506 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); 2505 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2507 2506
2508 while (khugepaged_scan.address < hend) { 2507 while (khugepaged_scan.address < hend) {
2509 int ret; 2508 int ret;
2510 cond_resched(); 2509 cond_resched();
2511 if (unlikely(khugepaged_test_exit(mm))) 2510 if (unlikely(khugepaged_test_exit(mm)))
2512 goto breakouterloop; 2511 goto breakouterloop;
2513 2512
2514 VM_BUG_ON(khugepaged_scan.address < hstart || 2513 VM_BUG_ON(khugepaged_scan.address < hstart ||
2515 khugepaged_scan.address + HPAGE_PMD_SIZE > 2514 khugepaged_scan.address + HPAGE_PMD_SIZE >
2516 hend); 2515 hend);
2517 ret = khugepaged_scan_pmd(mm, vma, 2516 ret = khugepaged_scan_pmd(mm, vma,
2518 khugepaged_scan.address, 2517 khugepaged_scan.address,
2519 hpage); 2518 hpage);
2520 /* move to next address */ 2519 /* move to next address */
2521 khugepaged_scan.address += HPAGE_PMD_SIZE; 2520 khugepaged_scan.address += HPAGE_PMD_SIZE;
2522 progress += HPAGE_PMD_NR; 2521 progress += HPAGE_PMD_NR;
2523 if (ret) 2522 if (ret)
2524 /* we released mmap_sem so break loop */ 2523 /* we released mmap_sem so break loop */
2525 goto breakouterloop_mmap_sem; 2524 goto breakouterloop_mmap_sem;
2526 if (progress >= pages) 2525 if (progress >= pages)
2527 goto breakouterloop; 2526 goto breakouterloop;
2528 } 2527 }
2529 } 2528 }
2530 breakouterloop: 2529 breakouterloop:
2531 up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ 2530 up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
2532 breakouterloop_mmap_sem: 2531 breakouterloop_mmap_sem:
2533 2532
2534 spin_lock(&khugepaged_mm_lock); 2533 spin_lock(&khugepaged_mm_lock);
2535 VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); 2534 VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
2536 /* 2535 /*
2537 * Release the current mm_slot if this mm is about to die, or 2536 * Release the current mm_slot if this mm is about to die, or
2538 * if we scanned all vmas of this mm. 2537 * if we scanned all vmas of this mm.
2539 */ 2538 */
2540 if (khugepaged_test_exit(mm) || !vma) { 2539 if (khugepaged_test_exit(mm) || !vma) {
2541 /* 2540 /*
2542 * Make sure that if mm_users is reaching zero while 2541 * Make sure that if mm_users is reaching zero while
2543 * khugepaged runs here, khugepaged_exit will find 2542 * khugepaged runs here, khugepaged_exit will find
2544 * mm_slot not pointing to the exiting mm. 2543 * mm_slot not pointing to the exiting mm.
2545 */ 2544 */
2546 if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { 2545 if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
2547 khugepaged_scan.mm_slot = list_entry( 2546 khugepaged_scan.mm_slot = list_entry(
2548 mm_slot->mm_node.next, 2547 mm_slot->mm_node.next,
2549 struct mm_slot, mm_node); 2548 struct mm_slot, mm_node);
2550 khugepaged_scan.address = 0; 2549 khugepaged_scan.address = 0;
2551 } else { 2550 } else {
2552 khugepaged_scan.mm_slot = NULL; 2551 khugepaged_scan.mm_slot = NULL;
2553 khugepaged_full_scans++; 2552 khugepaged_full_scans++;
2554 } 2553 }
2555 2554
2556 collect_mm_slot(mm_slot); 2555 collect_mm_slot(mm_slot);
2557 } 2556 }
2558 2557
2559 return progress; 2558 return progress;
2560 } 2559 }
2561 2560
2562 static int khugepaged_has_work(void) 2561 static int khugepaged_has_work(void)
2563 { 2562 {
2564 return !list_empty(&khugepaged_scan.mm_head) && 2563 return !list_empty(&khugepaged_scan.mm_head) &&
2565 khugepaged_enabled(); 2564 khugepaged_enabled();
2566 } 2565 }
2567 2566
2568 static int khugepaged_wait_event(void) 2567 static int khugepaged_wait_event(void)
2569 { 2568 {
2570 return !list_empty(&khugepaged_scan.mm_head) || 2569 return !list_empty(&khugepaged_scan.mm_head) ||
2571 kthread_should_stop(); 2570 kthread_should_stop();
2572 } 2571 }
2573 2572
2574 static void khugepaged_do_scan(void) 2573 static void khugepaged_do_scan(void)
2575 { 2574 {
2576 struct page *hpage = NULL; 2575 struct page *hpage = NULL;
2577 unsigned int progress = 0, pass_through_head = 0; 2576 unsigned int progress = 0, pass_through_head = 0;
2578 unsigned int pages = khugepaged_pages_to_scan; 2577 unsigned int pages = khugepaged_pages_to_scan;
2579 bool wait = true; 2578 bool wait = true;
2580 2579
2581 barrier(); /* write khugepaged_pages_to_scan to local stack */ 2580 barrier(); /* write khugepaged_pages_to_scan to local stack */
2582 2581
2583 while (progress < pages) { 2582 while (progress < pages) {
2584 if (!khugepaged_prealloc_page(&hpage, &wait)) 2583 if (!khugepaged_prealloc_page(&hpage, &wait))
2585 break; 2584 break;
2586 2585
2587 cond_resched(); 2586 cond_resched();
2588 2587
2589 if (unlikely(kthread_should_stop() || freezing(current))) 2588 if (unlikely(kthread_should_stop() || freezing(current)))
2590 break; 2589 break;
2591 2590
2592 spin_lock(&khugepaged_mm_lock); 2591 spin_lock(&khugepaged_mm_lock);
2593 if (!khugepaged_scan.mm_slot) 2592 if (!khugepaged_scan.mm_slot)
2594 pass_through_head++; 2593 pass_through_head++;
2595 if (khugepaged_has_work() && 2594 if (khugepaged_has_work() &&
2596 pass_through_head < 2) 2595 pass_through_head < 2)
2597 progress += khugepaged_scan_mm_slot(pages - progress, 2596 progress += khugepaged_scan_mm_slot(pages - progress,
2598 &hpage); 2597 &hpage);
2599 else 2598 else
2600 progress = pages; 2599 progress = pages;
2601 spin_unlock(&khugepaged_mm_lock); 2600 spin_unlock(&khugepaged_mm_lock);
2602 } 2601 }
2603 2602
2604 if (!IS_ERR_OR_NULL(hpage)) 2603 if (!IS_ERR_OR_NULL(hpage))
2605 put_page(hpage); 2604 put_page(hpage);
2606 } 2605 }
2607 2606
2608 static void khugepaged_wait_work(void) 2607 static void khugepaged_wait_work(void)
2609 { 2608 {
2610 try_to_freeze(); 2609 try_to_freeze();
2611 2610
2612 if (khugepaged_has_work()) { 2611 if (khugepaged_has_work()) {
2613 if (!khugepaged_scan_sleep_millisecs) 2612 if (!khugepaged_scan_sleep_millisecs)
2614 return; 2613 return;
2615 2614
2616 wait_event_freezable_timeout(khugepaged_wait, 2615 wait_event_freezable_timeout(khugepaged_wait,
2617 kthread_should_stop(), 2616 kthread_should_stop(),
2618 msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); 2617 msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
2619 return; 2618 return;
2620 } 2619 }
2621 2620
2622 if (khugepaged_enabled()) 2621 if (khugepaged_enabled())
2623 wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); 2622 wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
2624 } 2623 }
2625 2624
2626 static int khugepaged(void *none) 2625 static int khugepaged(void *none)
2627 { 2626 {
2628 struct mm_slot *mm_slot; 2627 struct mm_slot *mm_slot;
2629 2628
2630 set_freezable(); 2629 set_freezable();
2631 set_user_nice(current, 19); 2630 set_user_nice(current, 19);
2632 2631
2633 while (!kthread_should_stop()) { 2632 while (!kthread_should_stop()) {
2634 khugepaged_do_scan(); 2633 khugepaged_do_scan();
2635 khugepaged_wait_work(); 2634 khugepaged_wait_work();
2636 } 2635 }
2637 2636
2638 spin_lock(&khugepaged_mm_lock); 2637 spin_lock(&khugepaged_mm_lock);
2639 mm_slot = khugepaged_scan.mm_slot; 2638 mm_slot = khugepaged_scan.mm_slot;
2640 khugepaged_scan.mm_slot = NULL; 2639 khugepaged_scan.mm_slot = NULL;
2641 if (mm_slot) 2640 if (mm_slot)
2642 collect_mm_slot(mm_slot); 2641 collect_mm_slot(mm_slot);
2643 spin_unlock(&khugepaged_mm_lock); 2642 spin_unlock(&khugepaged_mm_lock);
2644 return 0; 2643 return 0;
2645 } 2644 }
2646 2645
2647 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, 2646 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2648 unsigned long haddr, pmd_t *pmd) 2647 unsigned long haddr, pmd_t *pmd)
2649 { 2648 {
2650 struct mm_struct *mm = vma->vm_mm; 2649 struct mm_struct *mm = vma->vm_mm;
2651 pgtable_t pgtable; 2650 pgtable_t pgtable;
2652 pmd_t _pmd; 2651 pmd_t _pmd;
2653 int i; 2652 int i;
2654 2653
2655 pmdp_clear_flush(vma, haddr, pmd); 2654 pmdp_clear_flush(vma, haddr, pmd);
2656 /* leave pmd empty until pte is filled */ 2655 /* leave pmd empty until pte is filled */
2657 2656
2658 pgtable = pgtable_trans_huge_withdraw(mm); 2657 pgtable = pgtable_trans_huge_withdraw(mm);
2659 pmd_populate(mm, &_pmd, pgtable); 2658 pmd_populate(mm, &_pmd, pgtable);
2660 2659
2661 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 2660 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
2662 pte_t *pte, entry; 2661 pte_t *pte, entry;
2663 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); 2662 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
2664 entry = pte_mkspecial(entry); 2663 entry = pte_mkspecial(entry);
2665 pte = pte_offset_map(&_pmd, haddr); 2664 pte = pte_offset_map(&_pmd, haddr);
2666 VM_BUG_ON(!pte_none(*pte)); 2665 VM_BUG_ON(!pte_none(*pte));
2667 set_pte_at(mm, haddr, pte, entry); 2666 set_pte_at(mm, haddr, pte, entry);
2668 pte_unmap(pte); 2667 pte_unmap(pte);
2669 } 2668 }
2670 smp_wmb(); /* make pte visible before pmd */ 2669 smp_wmb(); /* make pte visible before pmd */
2671 pmd_populate(mm, pmd, pgtable); 2670 pmd_populate(mm, pmd, pgtable);
2672 put_huge_zero_page(); 2671 put_huge_zero_page();
2673 } 2672 }
2674 2673
2675 void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, 2674 void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
2676 pmd_t *pmd) 2675 pmd_t *pmd)
2677 { 2676 {
2678 struct page *page; 2677 struct page *page;
2679 struct mm_struct *mm = vma->vm_mm; 2678 struct mm_struct *mm = vma->vm_mm;
2680 unsigned long haddr = address & HPAGE_PMD_MASK; 2679 unsigned long haddr = address & HPAGE_PMD_MASK;
2681 unsigned long mmun_start; /* For mmu_notifiers */ 2680 unsigned long mmun_start; /* For mmu_notifiers */
2682 unsigned long mmun_end; /* For mmu_notifiers */ 2681 unsigned long mmun_end; /* For mmu_notifiers */
2683 2682
2684 BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE); 2683 BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
2685 2684
2686 mmun_start = haddr; 2685 mmun_start = haddr;
2687 mmun_end = haddr + HPAGE_PMD_SIZE; 2686 mmun_end = haddr + HPAGE_PMD_SIZE;
2688 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2687 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2689 spin_lock(&mm->page_table_lock); 2688 spin_lock(&mm->page_table_lock);
2690 if (unlikely(!pmd_trans_huge(*pmd))) { 2689 if (unlikely(!pmd_trans_huge(*pmd))) {
2691 spin_unlock(&mm->page_table_lock); 2690 spin_unlock(&mm->page_table_lock);
2692 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2691 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2693 return; 2692 return;
2694 } 2693 }
2695 if (is_huge_zero_pmd(*pmd)) { 2694 if (is_huge_zero_pmd(*pmd)) {
2696 __split_huge_zero_page_pmd(vma, haddr, pmd); 2695 __split_huge_zero_page_pmd(vma, haddr, pmd);
2697 spin_unlock(&mm->page_table_lock); 2696 spin_unlock(&mm->page_table_lock);
2698 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2697 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2699 return; 2698 return;
2700 } 2699 }
2701 page = pmd_page(*pmd); 2700 page = pmd_page(*pmd);
2702 VM_BUG_ON(!page_count(page)); 2701 VM_BUG_ON(!page_count(page));
2703 get_page(page); 2702 get_page(page);
2704 spin_unlock(&mm->page_table_lock); 2703 spin_unlock(&mm->page_table_lock);
2705 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2704 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2706 2705
2707 split_huge_page(page); 2706 split_huge_page(page);
2708 2707
2709 put_page(page); 2708 put_page(page);
2710 BUG_ON(pmd_trans_huge(*pmd)); 2709 BUG_ON(pmd_trans_huge(*pmd));
2711 } 2710 }
2712 2711
2713 void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, 2712 void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
2714 pmd_t *pmd) 2713 pmd_t *pmd)
2715 { 2714 {
2716 struct vm_area_struct *vma; 2715 struct vm_area_struct *vma;
2717 2716
2718 vma = find_vma(mm, address); 2717 vma = find_vma(mm, address);
2719 BUG_ON(vma == NULL); 2718 BUG_ON(vma == NULL);
2720 split_huge_page_pmd(vma, address, pmd); 2719 split_huge_page_pmd(vma, address, pmd);
2721 } 2720 }
2722 2721
2723 static void split_huge_page_address(struct mm_struct *mm, 2722 static void split_huge_page_address(struct mm_struct *mm,
2724 unsigned long address) 2723 unsigned long address)
2725 { 2724 {
2726 pmd_t *pmd; 2725 pmd_t *pmd;
2727 2726
2728 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); 2727 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
2729 2728
2730 pmd = mm_find_pmd(mm, address); 2729 pmd = mm_find_pmd(mm, address);
2731 if (!pmd) 2730 if (!pmd)
2732 return; 2731 return;
2733 /* 2732 /*
2734 * Caller holds the mmap_sem in write mode, so a huge pmd cannot 2733 * Caller holds the mmap_sem in write mode, so a huge pmd cannot
2735 * materialize from under us. 2734 * materialize from under us.
2736 */ 2735 */
2737 split_huge_page_pmd_mm(mm, address, pmd); 2736 split_huge_page_pmd_mm(mm, address, pmd);
2738 } 2737 }
2739 2738
2740 void __vma_adjust_trans_huge(struct vm_area_struct *vma, 2739 void __vma_adjust_trans_huge(struct vm_area_struct *vma,
2741 unsigned long start, 2740 unsigned long start,
2742 unsigned long end, 2741 unsigned long end,
2743 long adjust_next) 2742 long adjust_next)
2744 { 2743 {
2745 /* 2744 /*
2746 * If the new start address isn't hpage aligned and it could 2745 * If the new start address isn't hpage aligned and it could
2747 * previously contain a hugepage: check if we need to split 2746 * previously contain a hugepage: check if we need to split
2748 * a huge pmd. 2747 * a huge pmd.
2749 */ 2748 */
2750 if (start & ~HPAGE_PMD_MASK && 2749 if (start & ~HPAGE_PMD_MASK &&
2751 (start & HPAGE_PMD_MASK) >= vma->vm_start && 2750 (start & HPAGE_PMD_MASK) >= vma->vm_start &&
2752 (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) 2751 (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2753 split_huge_page_address(vma->vm_mm, start); 2752 split_huge_page_address(vma->vm_mm, start);
2754 2753
2755 /* 2754 /*
2756 * If the new end address isn't hpage aligned and it could 2755 * If the new end address isn't hpage aligned and it could
2757 * previously contain a hugepage: check if we need to split 2756 * previously contain a hugepage: check if we need to split
2758 * a huge pmd. 2757 * a huge pmd.
2759 */ 2758 */
2760 if (end & ~HPAGE_PMD_MASK && 2759 if (end & ~HPAGE_PMD_MASK &&
2761 (end & HPAGE_PMD_MASK) >= vma->vm_start && 2760 (end & HPAGE_PMD_MASK) >= vma->vm_start &&
2762 (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) 2761 (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2763 split_huge_page_address(vma->vm_mm, end); 2762 split_huge_page_address(vma->vm_mm, end);
2764 2763
2765 /* 2764 /*
2766 * If we're also updating the vma->vm_next->vm_start, if the new 2765 * If we're also updating the vma->vm_next->vm_start, if the new
2767 * vm_next->vm_start isn't page aligned and it could previously 2766 * vm_next->vm_start isn't page aligned and it could previously
2768 * contain a hugepage: check if we need to split a huge pmd. 2767 * contain a hugepage: check if we need to split a huge pmd.
2769 */ 2768 */
2770 if (adjust_next > 0) { 2769 if (adjust_next > 0) {
2771 struct vm_area_struct *next = vma->vm_next; 2770 struct vm_area_struct *next = vma->vm_next;
2772 unsigned long nstart = next->vm_start; 2771 unsigned long nstart = next->vm_start;
2773 nstart += adjust_next << PAGE_SHIFT; 2772 nstart += adjust_next << PAGE_SHIFT;
2774 if (nstart & ~HPAGE_PMD_MASK && 2773 if (nstart & ~HPAGE_PMD_MASK &&
2775 (nstart & HPAGE_PMD_MASK) >= next->vm_start && 2774 (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
2776 (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) 2775 (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
2777 split_huge_page_address(next->vm_mm, nstart); 2776 split_huge_page_address(next->vm_mm, nstart);
2778 } 2777 }
2779 } 2778 }
1 /* 1 /*
2 * linux/mm/memory.c 2 * linux/mm/memory.c
3 * 3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 */ 5 */
6 6
7 /* 7 /*
8 * demand-loading started 01.12.91 - seems it is high on the list of 8 * demand-loading started 01.12.91 - seems it is high on the list of
9 * things wanted, and it should be easy to implement. - Linus 9 * things wanted, and it should be easy to implement. - Linus
10 */ 10 */
11 11
12 /* 12 /*
13 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared 13 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
14 * pages started 02.12.91, seems to work. - Linus. 14 * pages started 02.12.91, seems to work. - Linus.
15 * 15 *
16 * Tested sharing by executing about 30 /bin/sh: under the old kernel it 16 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
17 * would have taken more than the 6M I have free, but it worked well as 17 * would have taken more than the 6M I have free, but it worked well as
18 * far as I could see. 18 * far as I could see.
19 * 19 *
20 * Also corrected some "invalidate()"s - I wasn't doing enough of them. 20 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
21 */ 21 */
22 22
23 /* 23 /*
24 * Real VM (paging to/from disk) started 18.12.91. Much more work and 24 * Real VM (paging to/from disk) started 18.12.91. Much more work and
25 * thought has to go into this. Oh, well.. 25 * thought has to go into this. Oh, well..
26 * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. 26 * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
27 * Found it. Everything seems to work now. 27 * Found it. Everything seems to work now.
28 * 20.12.91 - Ok, making the swap-device changeable like the root. 28 * 20.12.91 - Ok, making the swap-device changeable like the root.
29 */ 29 */
30 30
31 /* 31 /*
32 * 05.04.94 - Multi-page memory management added for v1.1. 32 * 05.04.94 - Multi-page memory management added for v1.1.
33 * Idea by Alex Bligh (alex@cconcepts.co.uk) 33 * Idea by Alex Bligh (alex@cconcepts.co.uk)
34 * 34 *
35 * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG 35 * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
36 * (Gerhard.Wichert@pdb.siemens.de) 36 * (Gerhard.Wichert@pdb.siemens.de)
37 * 37 *
38 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) 38 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
39 */ 39 */
40 40
41 #include <linux/kernel_stat.h> 41 #include <linux/kernel_stat.h>
42 #include <linux/mm.h> 42 #include <linux/mm.h>
43 #include <linux/hugetlb.h> 43 #include <linux/hugetlb.h>
44 #include <linux/mman.h> 44 #include <linux/mman.h>
45 #include <linux/swap.h> 45 #include <linux/swap.h>
46 #include <linux/highmem.h> 46 #include <linux/highmem.h>
47 #include <linux/pagemap.h> 47 #include <linux/pagemap.h>
48 #include <linux/ksm.h> 48 #include <linux/ksm.h>
49 #include <linux/rmap.h> 49 #include <linux/rmap.h>
50 #include <linux/export.h> 50 #include <linux/export.h>
51 #include <linux/delayacct.h> 51 #include <linux/delayacct.h>
52 #include <linux/init.h> 52 #include <linux/init.h>
53 #include <linux/writeback.h> 53 #include <linux/writeback.h>
54 #include <linux/memcontrol.h> 54 #include <linux/memcontrol.h>
55 #include <linux/mmu_notifier.h> 55 #include <linux/mmu_notifier.h>
56 #include <linux/kallsyms.h> 56 #include <linux/kallsyms.h>
57 #include <linux/swapops.h> 57 #include <linux/swapops.h>
58 #include <linux/elf.h> 58 #include <linux/elf.h>
59 #include <linux/gfp.h> 59 #include <linux/gfp.h>
60 #include <linux/migrate.h> 60 #include <linux/migrate.h>
61 #include <linux/string.h> 61 #include <linux/string.h>
62 62
63 #include <asm/io.h> 63 #include <asm/io.h>
64 #include <asm/pgalloc.h> 64 #include <asm/pgalloc.h>
65 #include <asm/uaccess.h> 65 #include <asm/uaccess.h>
66 #include <asm/tlb.h> 66 #include <asm/tlb.h>
67 #include <asm/tlbflush.h> 67 #include <asm/tlbflush.h>
68 #include <asm/pgtable.h> 68 #include <asm/pgtable.h>
69 69
70 #include "internal.h" 70 #include "internal.h"
71 71
72 #ifdef LAST_NID_NOT_IN_PAGE_FLAGS 72 #ifdef LAST_NID_NOT_IN_PAGE_FLAGS
73 #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid. 73 #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid.
74 #endif 74 #endif
75 75
76 #ifndef CONFIG_NEED_MULTIPLE_NODES 76 #ifndef CONFIG_NEED_MULTIPLE_NODES
77 /* use the per-pgdat data instead for discontigmem - mbligh */ 77 /* use the per-pgdat data instead for discontigmem - mbligh */
78 unsigned long max_mapnr; 78 unsigned long max_mapnr;
79 struct page *mem_map; 79 struct page *mem_map;
80 80
81 EXPORT_SYMBOL(max_mapnr); 81 EXPORT_SYMBOL(max_mapnr);
82 EXPORT_SYMBOL(mem_map); 82 EXPORT_SYMBOL(mem_map);
83 #endif 83 #endif
84 84
85 unsigned long num_physpages; 85 unsigned long num_physpages;
86 /* 86 /*
87 * A number of key systems in x86 including ioremap() rely on the assumption 87 * A number of key systems in x86 including ioremap() rely on the assumption
88 * that high_memory defines the upper bound on direct map memory, then end 88 * that high_memory defines the upper bound on direct map memory, then end
89 * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and 89 * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
90 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL 90 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
91 * and ZONE_HIGHMEM. 91 * and ZONE_HIGHMEM.
92 */ 92 */
93 void * high_memory; 93 void * high_memory;
94 94
95 EXPORT_SYMBOL(num_physpages); 95 EXPORT_SYMBOL(num_physpages);
96 EXPORT_SYMBOL(high_memory); 96 EXPORT_SYMBOL(high_memory);
97 97
98 /* 98 /*
99 * Randomize the address space (stacks, mmaps, brk, etc.). 99 * Randomize the address space (stacks, mmaps, brk, etc.).
100 * 100 *
101 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, 101 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
102 * as ancient (libc5 based) binaries can segfault. ) 102 * as ancient (libc5 based) binaries can segfault. )
103 */ 103 */
104 int randomize_va_space __read_mostly = 104 int randomize_va_space __read_mostly =
105 #ifdef CONFIG_COMPAT_BRK 105 #ifdef CONFIG_COMPAT_BRK
106 1; 106 1;
107 #else 107 #else
108 2; 108 2;
109 #endif 109 #endif
110 110
111 static int __init disable_randmaps(char *s) 111 static int __init disable_randmaps(char *s)
112 { 112 {
113 randomize_va_space = 0; 113 randomize_va_space = 0;
114 return 1; 114 return 1;
115 } 115 }
116 __setup("norandmaps", disable_randmaps); 116 __setup("norandmaps", disable_randmaps);
117 117
118 unsigned long zero_pfn __read_mostly; 118 unsigned long zero_pfn __read_mostly;
119 unsigned long highest_memmap_pfn __read_mostly; 119 unsigned long highest_memmap_pfn __read_mostly;
120 120
121 /* 121 /*
122 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() 122 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
123 */ 123 */
124 static int __init init_zero_pfn(void) 124 static int __init init_zero_pfn(void)
125 { 125 {
126 zero_pfn = page_to_pfn(ZERO_PAGE(0)); 126 zero_pfn = page_to_pfn(ZERO_PAGE(0));
127 return 0; 127 return 0;
128 } 128 }
129 core_initcall(init_zero_pfn); 129 core_initcall(init_zero_pfn);
130 130
131 131
132 #if defined(SPLIT_RSS_COUNTING) 132 #if defined(SPLIT_RSS_COUNTING)
133 133
134 void sync_mm_rss(struct mm_struct *mm) 134 void sync_mm_rss(struct mm_struct *mm)
135 { 135 {
136 int i; 136 int i;
137 137
138 for (i = 0; i < NR_MM_COUNTERS; i++) { 138 for (i = 0; i < NR_MM_COUNTERS; i++) {
139 if (current->rss_stat.count[i]) { 139 if (current->rss_stat.count[i]) {
140 add_mm_counter(mm, i, current->rss_stat.count[i]); 140 add_mm_counter(mm, i, current->rss_stat.count[i]);
141 current->rss_stat.count[i] = 0; 141 current->rss_stat.count[i] = 0;
142 } 142 }
143 } 143 }
144 current->rss_stat.events = 0; 144 current->rss_stat.events = 0;
145 } 145 }
146 146
147 static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) 147 static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
148 { 148 {
149 struct task_struct *task = current; 149 struct task_struct *task = current;
150 150
151 if (likely(task->mm == mm)) 151 if (likely(task->mm == mm))
152 task->rss_stat.count[member] += val; 152 task->rss_stat.count[member] += val;
153 else 153 else
154 add_mm_counter(mm, member, val); 154 add_mm_counter(mm, member, val);
155 } 155 }
156 #define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) 156 #define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
157 #define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) 157 #define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
158 158
159 /* sync counter once per 64 page faults */ 159 /* sync counter once per 64 page faults */
160 #define TASK_RSS_EVENTS_THRESH (64) 160 #define TASK_RSS_EVENTS_THRESH (64)
161 static void check_sync_rss_stat(struct task_struct *task) 161 static void check_sync_rss_stat(struct task_struct *task)
162 { 162 {
163 if (unlikely(task != current)) 163 if (unlikely(task != current))
164 return; 164 return;
165 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) 165 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
166 sync_mm_rss(task->mm); 166 sync_mm_rss(task->mm);
167 } 167 }
168 #else /* SPLIT_RSS_COUNTING */ 168 #else /* SPLIT_RSS_COUNTING */
169 169
170 #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) 170 #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
171 #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) 171 #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
172 172
173 static void check_sync_rss_stat(struct task_struct *task) 173 static void check_sync_rss_stat(struct task_struct *task)
174 { 174 {
175 } 175 }
176 176
177 #endif /* SPLIT_RSS_COUNTING */ 177 #endif /* SPLIT_RSS_COUNTING */
178 178
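A minimal usage sketch may help here (editor's addition, not part of this file): a fault-handling path bumps the cheap task-local counter and lets check_sync_rss_stat() fold it back into the mm_struct once TASK_RSS_EVENTS_THRESH events have accumulated. The helper name below is hypothetical.

/* Editor's sketch: hypothetical caller of the fast RSS helpers above. */
static void demo_account_anon_fault(struct mm_struct *mm)
{
        inc_mm_counter_fast(mm, MM_ANONPAGES);  /* cheap, task-local update */
        check_sync_rss_stat(current);           /* occasional global sync   */
}
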
179 #ifdef HAVE_GENERIC_MMU_GATHER 179 #ifdef HAVE_GENERIC_MMU_GATHER
180 180
181 static int tlb_next_batch(struct mmu_gather *tlb) 181 static int tlb_next_batch(struct mmu_gather *tlb)
182 { 182 {
183 struct mmu_gather_batch *batch; 183 struct mmu_gather_batch *batch;
184 184
185 batch = tlb->active; 185 batch = tlb->active;
186 if (batch->next) { 186 if (batch->next) {
187 tlb->active = batch->next; 187 tlb->active = batch->next;
188 return 1; 188 return 1;
189 } 189 }
190 190
191 if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) 191 if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
192 return 0; 192 return 0;
193 193
194 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); 194 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
195 if (!batch) 195 if (!batch)
196 return 0; 196 return 0;
197 197
198 tlb->batch_count++; 198 tlb->batch_count++;
199 batch->next = NULL; 199 batch->next = NULL;
200 batch->nr = 0; 200 batch->nr = 0;
201 batch->max = MAX_GATHER_BATCH; 201 batch->max = MAX_GATHER_BATCH;
202 202
203 tlb->active->next = batch; 203 tlb->active->next = batch;
204 tlb->active = batch; 204 tlb->active = batch;
205 205
206 return 1; 206 return 1;
207 } 207 }
208 208
209 /* tlb_gather_mmu 209 /* tlb_gather_mmu
210 * Called to initialize an (on-stack) mmu_gather structure for page-table 210 * Called to initialize an (on-stack) mmu_gather structure for page-table
211 * tear-down from @mm. The @fullmm argument is used when @mm is without 211 * tear-down from @mm. The @fullmm argument is used when @mm is without
212 * users and we're going to destroy the full address space (exit/execve). 212 * users and we're going to destroy the full address space (exit/execve).
213 */ 213 */
214 void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) 214 void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
215 { 215 {
216 tlb->mm = mm; 216 tlb->mm = mm;
217 217
218 tlb->fullmm = fullmm; 218 tlb->fullmm = fullmm;
219 tlb->need_flush_all = 0; 219 tlb->need_flush_all = 0;
220 tlb->start = -1UL; 220 tlb->start = -1UL;
221 tlb->end = 0; 221 tlb->end = 0;
222 tlb->need_flush = 0; 222 tlb->need_flush = 0;
223 tlb->fast_mode = (num_possible_cpus() == 1); 223 tlb->fast_mode = (num_possible_cpus() == 1);
224 tlb->local.next = NULL; 224 tlb->local.next = NULL;
225 tlb->local.nr = 0; 225 tlb->local.nr = 0;
226 tlb->local.max = ARRAY_SIZE(tlb->__pages); 226 tlb->local.max = ARRAY_SIZE(tlb->__pages);
227 tlb->active = &tlb->local; 227 tlb->active = &tlb->local;
228 tlb->batch_count = 0; 228 tlb->batch_count = 0;
229 229
230 #ifdef CONFIG_HAVE_RCU_TABLE_FREE 230 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
231 tlb->batch = NULL; 231 tlb->batch = NULL;
232 #endif 232 #endif
233 } 233 }
234 234
235 void tlb_flush_mmu(struct mmu_gather *tlb) 235 void tlb_flush_mmu(struct mmu_gather *tlb)
236 { 236 {
237 struct mmu_gather_batch *batch; 237 struct mmu_gather_batch *batch;
238 238
239 if (!tlb->need_flush) 239 if (!tlb->need_flush)
240 return; 240 return;
241 tlb->need_flush = 0; 241 tlb->need_flush = 0;
242 tlb_flush(tlb); 242 tlb_flush(tlb);
243 #ifdef CONFIG_HAVE_RCU_TABLE_FREE 243 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
244 tlb_table_flush(tlb); 244 tlb_table_flush(tlb);
245 #endif 245 #endif
246 246
247 if (tlb_fast_mode(tlb)) 247 if (tlb_fast_mode(tlb))
248 return; 248 return;
249 249
250 for (batch = &tlb->local; batch; batch = batch->next) { 250 for (batch = &tlb->local; batch; batch = batch->next) {
251 free_pages_and_swap_cache(batch->pages, batch->nr); 251 free_pages_and_swap_cache(batch->pages, batch->nr);
252 batch->nr = 0; 252 batch->nr = 0;
253 } 253 }
254 tlb->active = &tlb->local; 254 tlb->active = &tlb->local;
255 } 255 }
256 256
257 /* tlb_finish_mmu 257 /* tlb_finish_mmu
258 * Called at the end of the shootdown operation to free up any resources 258 * Called at the end of the shootdown operation to free up any resources
259 * that were required. 259 * that were required.
260 */ 260 */
261 void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) 261 void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
262 { 262 {
263 struct mmu_gather_batch *batch, *next; 263 struct mmu_gather_batch *batch, *next;
264 264
265 tlb->start = start; 265 tlb->start = start;
266 tlb->end = end; 266 tlb->end = end;
267 tlb_flush_mmu(tlb); 267 tlb_flush_mmu(tlb);
268 268
269 /* keep the page table cache within bounds */ 269 /* keep the page table cache within bounds */
270 check_pgt_cache(); 270 check_pgt_cache();
271 271
272 for (batch = tlb->local.next; batch; batch = next) { 272 for (batch = tlb->local.next; batch; batch = next) {
273 next = batch->next; 273 next = batch->next;
274 free_pages((unsigned long)batch, 0); 274 free_pages((unsigned long)batch, 0);
275 } 275 }
276 tlb->local.next = NULL; 276 tlb->local.next = NULL;
277 } 277 }
278 278
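Taken together, tlb_gather_mmu(), tlb_flush_mmu() and tlb_finish_mmu() form a bracketed protocol around a page-table teardown. The fragment below is an editor's illustration of a typical caller; the function name and the elided unmap step are placeholders, not code from this file.

/* Editor's sketch: the mmu_gather lifecycle around a hypothetical unmap step. */
static void demo_teardown_range(struct mm_struct *mm,
                                unsigned long start, unsigned long end)
{
        struct mmu_gather tlb;

        tlb_gather_mmu(&tlb, mm, false);        /* on-stack gather, not a full-mm teardown */
        /* ... clear ptes and queue their pages via __tlb_remove_page() ... */
        tlb_finish_mmu(&tlb, start, end);       /* final TLB flush, then free the batches  */
}
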
279 /* __tlb_remove_page 279 /* __tlb_remove_page
280 * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while 280 * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
281 * handling the additional races in SMP caused by other CPUs caching valid 281 * handling the additional races in SMP caused by other CPUs caching valid
282 * mappings in their TLBs. Returns the number of free page slots left. 282 * mappings in their TLBs. Returns the number of free page slots left.
283 * When out of page slots we must call tlb_flush_mmu(). 283 * When out of page slots we must call tlb_flush_mmu().
284 */ 284 */
285 int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) 285 int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
286 { 286 {
287 struct mmu_gather_batch *batch; 287 struct mmu_gather_batch *batch;
288 288
289 VM_BUG_ON(!tlb->need_flush); 289 VM_BUG_ON(!tlb->need_flush);
290 290
291 if (tlb_fast_mode(tlb)) { 291 if (tlb_fast_mode(tlb)) {
292 free_page_and_swap_cache(page); 292 free_page_and_swap_cache(page);
293 return 1; /* avoid calling tlb_flush_mmu() */ 293 return 1; /* avoid calling tlb_flush_mmu() */
294 } 294 }
295 295
296 batch = tlb->active; 296 batch = tlb->active;
297 batch->pages[batch->nr++] = page; 297 batch->pages[batch->nr++] = page;
298 if (batch->nr == batch->max) { 298 if (batch->nr == batch->max) {
299 if (!tlb_next_batch(tlb)) 299 if (!tlb_next_batch(tlb))
300 return 0; 300 return 0;
301 batch = tlb->active; 301 batch = tlb->active;
302 } 302 }
303 VM_BUG_ON(batch->nr > batch->max); 303 VM_BUG_ON(batch->nr > batch->max);
304 304
305 return batch->max - batch->nr; 305 return batch->max - batch->nr;
306 } 306 }
307 307
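The return-value contract above ("returns the number of free page slots left; when out of slots, call tlb_flush_mmu()") is easiest to see as a loop. The sketch below is an editor's illustration only; real callers such as zap_pte_range() set need_flush through tlb_remove_tlb_entry() rather than by hand.

/* Editor's sketch: hypothetical batched release driven by the API above. */
static void demo_batch_release(struct mmu_gather *tlb,
                               struct page **pages, int nr)
{
        int i;

        for (i = 0; i < nr; i++) {
                tlb->need_flush = 1;                    /* simplification, see lead-in */
                if (!__tlb_remove_page(tlb, pages[i]))  /* 0 means the batch is full   */
                        tlb_flush_mmu(tlb);             /* drain it, then keep going   */
        }
}
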
308 #endif /* HAVE_GENERIC_MMU_GATHER */ 308 #endif /* HAVE_GENERIC_MMU_GATHER */
309 309
310 #ifdef CONFIG_HAVE_RCU_TABLE_FREE 310 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
311 311
312 /* 312 /*
313 * See the comment near struct mmu_table_batch. 313 * See the comment near struct mmu_table_batch.
314 */ 314 */
315 315
316 static void tlb_remove_table_smp_sync(void *arg) 316 static void tlb_remove_table_smp_sync(void *arg)
317 { 317 {
318 /* Simply deliver the interrupt */ 318 /* Simply deliver the interrupt */
319 } 319 }
320 320
321 static void tlb_remove_table_one(void *table) 321 static void tlb_remove_table_one(void *table)
322 { 322 {
323 /* 323 /*
324 * This isn't an RCU grace period and hence the page-tables cannot be 324 * This isn't an RCU grace period and hence the page-tables cannot be
325 * assumed to be actually RCU-freed. 325 * assumed to be actually RCU-freed.
326 * 326 *
327 * It is however sufficient for software page-table walkers that rely on 327 * It is however sufficient for software page-table walkers that rely on
328 * IRQ disabling. See the comment near struct mmu_table_batch. 328 * IRQ disabling. See the comment near struct mmu_table_batch.
329 */ 329 */
330 smp_call_function(tlb_remove_table_smp_sync, NULL, 1); 330 smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
331 __tlb_remove_table(table); 331 __tlb_remove_table(table);
332 } 332 }
333 333
334 static void tlb_remove_table_rcu(struct rcu_head *head) 334 static void tlb_remove_table_rcu(struct rcu_head *head)
335 { 335 {
336 struct mmu_table_batch *batch; 336 struct mmu_table_batch *batch;
337 int i; 337 int i;
338 338
339 batch = container_of(head, struct mmu_table_batch, rcu); 339 batch = container_of(head, struct mmu_table_batch, rcu);
340 340
341 for (i = 0; i < batch->nr; i++) 341 for (i = 0; i < batch->nr; i++)
342 __tlb_remove_table(batch->tables[i]); 342 __tlb_remove_table(batch->tables[i]);
343 343
344 free_page((unsigned long)batch); 344 free_page((unsigned long)batch);
345 } 345 }
346 346
347 void tlb_table_flush(struct mmu_gather *tlb) 347 void tlb_table_flush(struct mmu_gather *tlb)
348 { 348 {
349 struct mmu_table_batch **batch = &tlb->batch; 349 struct mmu_table_batch **batch = &tlb->batch;
350 350
351 if (*batch) { 351 if (*batch) {
352 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); 352 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
353 *batch = NULL; 353 *batch = NULL;
354 } 354 }
355 } 355 }
356 356
357 void tlb_remove_table(struct mmu_gather *tlb, void *table) 357 void tlb_remove_table(struct mmu_gather *tlb, void *table)
358 { 358 {
359 struct mmu_table_batch **batch = &tlb->batch; 359 struct mmu_table_batch **batch = &tlb->batch;
360 360
361 tlb->need_flush = 1; 361 tlb->need_flush = 1;
362 362
363 /* 363 /*
364 * When there's less than two users of this mm there cannot be a 364 * When there's less than two users of this mm there cannot be a
365 * concurrent page-table walk. 365 * concurrent page-table walk.
366 */ 366 */
367 if (atomic_read(&tlb->mm->mm_users) < 2) { 367 if (atomic_read(&tlb->mm->mm_users) < 2) {
368 __tlb_remove_table(table); 368 __tlb_remove_table(table);
369 return; 369 return;
370 } 370 }
371 371
372 if (*batch == NULL) { 372 if (*batch == NULL) {
373 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); 373 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
374 if (*batch == NULL) { 374 if (*batch == NULL) {
375 tlb_remove_table_one(table); 375 tlb_remove_table_one(table);
376 return; 376 return;
377 } 377 }
378 (*batch)->nr = 0; 378 (*batch)->nr = 0;
379 } 379 }
380 (*batch)->tables[(*batch)->nr++] = table; 380 (*batch)->tables[(*batch)->nr++] = table;
381 if ((*batch)->nr == MAX_TABLE_BATCH) 381 if ((*batch)->nr == MAX_TABLE_BATCH)
382 tlb_table_flush(tlb); 382 tlb_table_flush(tlb);
383 } 383 }
384 384
385 #endif /* CONFIG_HAVE_RCU_TABLE_FREE */ 385 #endif /* CONFIG_HAVE_RCU_TABLE_FREE */
386 386
387 /* 387 /*
388 * If a p?d_bad entry is found while walking page tables, report 388 * If a p?d_bad entry is found while walking page tables, report
389 * the error, before resetting entry to p?d_none. Usually (but 389 * the error, before resetting entry to p?d_none. Usually (but
390 * very seldom) called out from the p?d_none_or_clear_bad macros. 390 * very seldom) called out from the p?d_none_or_clear_bad macros.
391 */ 391 */
392 392
393 void pgd_clear_bad(pgd_t *pgd) 393 void pgd_clear_bad(pgd_t *pgd)
394 { 394 {
395 pgd_ERROR(*pgd); 395 pgd_ERROR(*pgd);
396 pgd_clear(pgd); 396 pgd_clear(pgd);
397 } 397 }
398 398
399 void pud_clear_bad(pud_t *pud) 399 void pud_clear_bad(pud_t *pud)
400 { 400 {
401 pud_ERROR(*pud); 401 pud_ERROR(*pud);
402 pud_clear(pud); 402 pud_clear(pud);
403 } 403 }
404 404
405 void pmd_clear_bad(pmd_t *pmd) 405 void pmd_clear_bad(pmd_t *pmd)
406 { 406 {
407 pmd_ERROR(*pmd); 407 pmd_ERROR(*pmd);
408 pmd_clear(pmd); 408 pmd_clear(pmd);
409 } 409 }
410 410
411 /* 411 /*
412 * Note: this doesn't free the actual pages themselves. That 412 * Note: this doesn't free the actual pages themselves. That
413 * has been handled earlier when unmapping all the memory regions. 413 * has been handled earlier when unmapping all the memory regions.
414 */ 414 */
415 static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, 415 static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
416 unsigned long addr) 416 unsigned long addr)
417 { 417 {
418 pgtable_t token = pmd_pgtable(*pmd); 418 pgtable_t token = pmd_pgtable(*pmd);
419 pmd_clear(pmd); 419 pmd_clear(pmd);
420 pte_free_tlb(tlb, token, addr); 420 pte_free_tlb(tlb, token, addr);
421 tlb->mm->nr_ptes--; 421 tlb->mm->nr_ptes--;
422 } 422 }
423 423
424 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, 424 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
425 unsigned long addr, unsigned long end, 425 unsigned long addr, unsigned long end,
426 unsigned long floor, unsigned long ceiling) 426 unsigned long floor, unsigned long ceiling)
427 { 427 {
428 pmd_t *pmd; 428 pmd_t *pmd;
429 unsigned long next; 429 unsigned long next;
430 unsigned long start; 430 unsigned long start;
431 431
432 start = addr; 432 start = addr;
433 pmd = pmd_offset(pud, addr); 433 pmd = pmd_offset(pud, addr);
434 do { 434 do {
435 next = pmd_addr_end(addr, end); 435 next = pmd_addr_end(addr, end);
436 if (pmd_none_or_clear_bad(pmd)) 436 if (pmd_none_or_clear_bad(pmd))
437 continue; 437 continue;
438 free_pte_range(tlb, pmd, addr); 438 free_pte_range(tlb, pmd, addr);
439 } while (pmd++, addr = next, addr != end); 439 } while (pmd++, addr = next, addr != end);
440 440
441 start &= PUD_MASK; 441 start &= PUD_MASK;
442 if (start < floor) 442 if (start < floor)
443 return; 443 return;
444 if (ceiling) { 444 if (ceiling) {
445 ceiling &= PUD_MASK; 445 ceiling &= PUD_MASK;
446 if (!ceiling) 446 if (!ceiling)
447 return; 447 return;
448 } 448 }
449 if (end - 1 > ceiling - 1) 449 if (end - 1 > ceiling - 1)
450 return; 450 return;
451 451
452 pmd = pmd_offset(pud, start); 452 pmd = pmd_offset(pud, start);
453 pud_clear(pud); 453 pud_clear(pud);
454 pmd_free_tlb(tlb, pmd, start); 454 pmd_free_tlb(tlb, pmd, start);
455 } 455 }
456 456
457 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, 457 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
458 unsigned long addr, unsigned long end, 458 unsigned long addr, unsigned long end,
459 unsigned long floor, unsigned long ceiling) 459 unsigned long floor, unsigned long ceiling)
460 { 460 {
461 pud_t *pud; 461 pud_t *pud;
462 unsigned long next; 462 unsigned long next;
463 unsigned long start; 463 unsigned long start;
464 464
465 start = addr; 465 start = addr;
466 pud = pud_offset(pgd, addr); 466 pud = pud_offset(pgd, addr);
467 do { 467 do {
468 next = pud_addr_end(addr, end); 468 next = pud_addr_end(addr, end);
469 if (pud_none_or_clear_bad(pud)) 469 if (pud_none_or_clear_bad(pud))
470 continue; 470 continue;
471 free_pmd_range(tlb, pud, addr, next, floor, ceiling); 471 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
472 } while (pud++, addr = next, addr != end); 472 } while (pud++, addr = next, addr != end);
473 473
474 start &= PGDIR_MASK; 474 start &= PGDIR_MASK;
475 if (start < floor) 475 if (start < floor)
476 return; 476 return;
477 if (ceiling) { 477 if (ceiling) {
478 ceiling &= PGDIR_MASK; 478 ceiling &= PGDIR_MASK;
479 if (!ceiling) 479 if (!ceiling)
480 return; 480 return;
481 } 481 }
482 if (end - 1 > ceiling - 1) 482 if (end - 1 > ceiling - 1)
483 return; 483 return;
484 484
485 pud = pud_offset(pgd, start); 485 pud = pud_offset(pgd, start);
486 pgd_clear(pgd); 486 pgd_clear(pgd);
487 pud_free_tlb(tlb, pud, start); 487 pud_free_tlb(tlb, pud, start);
488 } 488 }
489 489
490 /* 490 /*
491 * This function frees user-level page tables of a process. 491 * This function frees user-level page tables of a process.
492 * 492 *
493 * Must be called with pagetable lock held. 493 * Must be called with pagetable lock held.
494 */ 494 */
495 void free_pgd_range(struct mmu_gather *tlb, 495 void free_pgd_range(struct mmu_gather *tlb,
496 unsigned long addr, unsigned long end, 496 unsigned long addr, unsigned long end,
497 unsigned long floor, unsigned long ceiling) 497 unsigned long floor, unsigned long ceiling)
498 { 498 {
499 pgd_t *pgd; 499 pgd_t *pgd;
500 unsigned long next; 500 unsigned long next;
501 501
502 /* 502 /*
503 * The next few lines have given us lots of grief... 503 * The next few lines have given us lots of grief...
504 * 504 *
505 * Why are we testing PMD* at this top level? Because often 505 * Why are we testing PMD* at this top level? Because often
506 * there will be no work to do at all, and we'd prefer not to 506 * there will be no work to do at all, and we'd prefer not to
507 * go all the way down to the bottom just to discover that. 507 * go all the way down to the bottom just to discover that.
508 * 508 *
509 * Why all these "- 1"s? Because 0 represents both the bottom 509 * Why all these "- 1"s? Because 0 represents both the bottom
510 * of the address space and the top of it (using -1 for the 510 * of the address space and the top of it (using -1 for the
511 * top wouldn't help much: the masks would do the wrong thing). 511 * top wouldn't help much: the masks would do the wrong thing).
512 * The rule is that addr 0 and floor 0 refer to the bottom of 512 * The rule is that addr 0 and floor 0 refer to the bottom of
513 * the address space, but end 0 and ceiling 0 refer to the top 513 * the address space, but end 0 and ceiling 0 refer to the top
514 * Comparisons need to use "end - 1" and "ceiling - 1" (though 514 * Comparisons need to use "end - 1" and "ceiling - 1" (though
515 * that end 0 case should be mythical). 515 * that end 0 case should be mythical).
516 * 516 *
517 * Wherever addr is brought up or ceiling brought down, we must 517 * Wherever addr is brought up or ceiling brought down, we must
518 * be careful to reject "the opposite 0" before it confuses the 518 * be careful to reject "the opposite 0" before it confuses the
519 * subsequent tests. But what about where end is brought down 519 * subsequent tests. But what about where end is brought down
520 * by PMD_SIZE below? no, end can't go down to 0 there. 520 * by PMD_SIZE below? no, end can't go down to 0 there.
521 * 521 *
522 * Whereas we round start (addr) and ceiling down, by different 522 * Whereas we round start (addr) and ceiling down, by different
523 * masks at different levels, in order to test whether a table 523 * masks at different levels, in order to test whether a table
524 * now has no other vmas using it, so can be freed, we don't 524 * now has no other vmas using it, so can be freed, we don't
525 * bother to round floor or end up - the tests don't need that. 525 * bother to round floor or end up - the tests don't need that.
526 */ 526 */
527 527
528 addr &= PMD_MASK; 528 addr &= PMD_MASK;
529 if (addr < floor) { 529 if (addr < floor) {
530 addr += PMD_SIZE; 530 addr += PMD_SIZE;
531 if (!addr) 531 if (!addr)
532 return; 532 return;
533 } 533 }
534 if (ceiling) { 534 if (ceiling) {
535 ceiling &= PMD_MASK; 535 ceiling &= PMD_MASK;
536 if (!ceiling) 536 if (!ceiling)
537 return; 537 return;
538 } 538 }
539 if (end - 1 > ceiling - 1) 539 if (end - 1 > ceiling - 1)
540 end -= PMD_SIZE; 540 end -= PMD_SIZE;
541 if (addr > end - 1) 541 if (addr > end - 1)
542 return; 542 return;
543 543
544 pgd = pgd_offset(tlb->mm, addr); 544 pgd = pgd_offset(tlb->mm, addr);
545 do { 545 do {
546 next = pgd_addr_end(addr, end); 546 next = pgd_addr_end(addr, end);
547 if (pgd_none_or_clear_bad(pgd)) 547 if (pgd_none_or_clear_bad(pgd))
548 continue; 548 continue;
549 free_pud_range(tlb, pgd, addr, next, floor, ceiling); 549 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
550 } while (pgd++, addr = next, addr != end); 550 } while (pgd++, addr = next, addr != end);
551 } 551 }
552 552
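The "- 1" trick described in the comment above is worth seeing in isolation. The helper below is an editor's sketch, not part of the file: because floor/ceiling use 0 for the very top of the address space, subtracting 1 makes ceiling == 0 wrap to ULONG_MAX, so a "top" ceiling never rejects anything, while a genuine low ceiling still rejects ranges that reach past it.

/* Editor's sketch: the unsigned-wraparound comparison relied on by the
 * free_p?d_range() helpers above, with 0 meaning "top of address space". */
static inline bool demo_end_within_ceiling(unsigned long end,
                                           unsigned long ceiling)
{
        return end - 1 <= ceiling - 1;
}
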
553 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, 553 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
554 unsigned long floor, unsigned long ceiling) 554 unsigned long floor, unsigned long ceiling)
555 { 555 {
556 while (vma) { 556 while (vma) {
557 struct vm_area_struct *next = vma->vm_next; 557 struct vm_area_struct *next = vma->vm_next;
558 unsigned long addr = vma->vm_start; 558 unsigned long addr = vma->vm_start;
559 559
560 /* 560 /*
561 * Hide vma from rmap and truncate_pagecache before freeing 561 * Hide vma from rmap and truncate_pagecache before freeing
562 * pgtables 562 * pgtables
563 */ 563 */
564 unlink_anon_vmas(vma); 564 unlink_anon_vmas(vma);
565 unlink_file_vma(vma); 565 unlink_file_vma(vma);
566 566
567 if (is_vm_hugetlb_page(vma)) { 567 if (is_vm_hugetlb_page(vma)) {
568 hugetlb_free_pgd_range(tlb, addr, vma->vm_end, 568 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
569 floor, next? next->vm_start: ceiling); 569 floor, next? next->vm_start: ceiling);
570 } else { 570 } else {
571 /* 571 /*
572 * Optimization: gather nearby vmas into one call down 572 * Optimization: gather nearby vmas into one call down
573 */ 573 */
574 while (next && next->vm_start <= vma->vm_end + PMD_SIZE 574 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
575 && !is_vm_hugetlb_page(next)) { 575 && !is_vm_hugetlb_page(next)) {
576 vma = next; 576 vma = next;
577 next = vma->vm_next; 577 next = vma->vm_next;
578 unlink_anon_vmas(vma); 578 unlink_anon_vmas(vma);
579 unlink_file_vma(vma); 579 unlink_file_vma(vma);
580 } 580 }
581 free_pgd_range(tlb, addr, vma->vm_end, 581 free_pgd_range(tlb, addr, vma->vm_end,
582 floor, next? next->vm_start: ceiling); 582 floor, next? next->vm_start: ceiling);
583 } 583 }
584 vma = next; 584 vma = next;
585 } 585 }
586 } 586 }
587 587
588 int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, 588 int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
589 pmd_t *pmd, unsigned long address) 589 pmd_t *pmd, unsigned long address)
590 { 590 {
591 pgtable_t new = pte_alloc_one(mm, address); 591 pgtable_t new = pte_alloc_one(mm, address);
592 int wait_split_huge_page; 592 int wait_split_huge_page;
593 if (!new) 593 if (!new)
594 return -ENOMEM; 594 return -ENOMEM;
595 595
596 /* 596 /*
597 * Ensure all pte setup (eg. pte page lock and page clearing) is 597 * Ensure all pte setup (eg. pte page lock and page clearing) is
598 * visible before the pte is made visible to other CPUs by being 598 * visible before the pte is made visible to other CPUs by being
599 * put into page tables. 599 * put into page tables.
600 * 600 *
601 * The other side of the story is the pointer chasing in the page 601 * The other side of the story is the pointer chasing in the page
602 * table walking code (when walking the page table without locking; 602 * table walking code (when walking the page table without locking;
603 * ie. most of the time). Fortunately, these data accesses consist 603 * ie. most of the time). Fortunately, these data accesses consist
604 * of a chain of data-dependent loads, meaning most CPUs (alpha 604 * of a chain of data-dependent loads, meaning most CPUs (alpha
605 * being the notable exception) will already guarantee loads are 605 * being the notable exception) will already guarantee loads are
606 * seen in-order. See the alpha page table accessors for the 606 * seen in-order. See the alpha page table accessors for the
607 * smp_read_barrier_depends() barriers in page table walking code. 607 * smp_read_barrier_depends() barriers in page table walking code.
608 */ 608 */
609 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ 609 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
610 610
611 spin_lock(&mm->page_table_lock); 611 spin_lock(&mm->page_table_lock);
612 wait_split_huge_page = 0; 612 wait_split_huge_page = 0;
613 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ 613 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
614 mm->nr_ptes++; 614 mm->nr_ptes++;
615 pmd_populate(mm, pmd, new); 615 pmd_populate(mm, pmd, new);
616 new = NULL; 616 new = NULL;
617 } else if (unlikely(pmd_trans_splitting(*pmd))) 617 } else if (unlikely(pmd_trans_splitting(*pmd)))
618 wait_split_huge_page = 1; 618 wait_split_huge_page = 1;
619 spin_unlock(&mm->page_table_lock); 619 spin_unlock(&mm->page_table_lock);
620 if (new) 620 if (new)
621 pte_free(mm, new); 621 pte_free(mm, new);
622 if (wait_split_huge_page) 622 if (wait_split_huge_page)
623 wait_split_huge_page(vma->anon_vma, pmd); 623 wait_split_huge_page(vma->anon_vma, pmd);
624 return 0; 624 return 0;
625 } 625 }
626 626
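The comment above describes a classic publish pattern, so a stripped-down sketch may help. Everything below is an editor's illustration with invented names (demo_obj, demo_publish(), demo_read()), not kernel code: the writer initialises the object completely, orders that initialisation against publication with smp_wmb(), and only then stores the pointer; the reader's accesses form a data-dependent chain, which every architecture except alpha already keeps ordered (alpha needs smp_read_barrier_depends()).

/* Editor's sketch: the smp_wmb() publish pattern described above. */
struct demo_obj {
        int payload;
};
static struct demo_obj *demo_shared;            /* hypothetical shared pointer */

static void demo_publish(struct demo_obj *obj)
{
        obj->payload = 42;              /* 1: initialise everything          */
        smp_wmb();                      /* 2: order the init before publish  */
        demo_shared = obj;              /* 3: make the pointer visible       */
}

static int demo_read(void)
{
        struct demo_obj *obj = demo_shared;     /* data-dependent chain starts here */

        smp_read_barrier_depends();             /* no-op everywhere except alpha    */
        return obj ? obj->payload : -1;
}
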
627 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) 627 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
628 { 628 {
629 pte_t *new = pte_alloc_one_kernel(&init_mm, address); 629 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
630 if (!new) 630 if (!new)
631 return -ENOMEM; 631 return -ENOMEM;
632 632
633 smp_wmb(); /* See comment in __pte_alloc */ 633 smp_wmb(); /* See comment in __pte_alloc */
634 634
635 spin_lock(&init_mm.page_table_lock); 635 spin_lock(&init_mm.page_table_lock);
636 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ 636 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
637 pmd_populate_kernel(&init_mm, pmd, new); 637 pmd_populate_kernel(&init_mm, pmd, new);
638 new = NULL; 638 new = NULL;
639 } else 639 } else
640 VM_BUG_ON(pmd_trans_splitting(*pmd)); 640 VM_BUG_ON(pmd_trans_splitting(*pmd));
641 spin_unlock(&init_mm.page_table_lock); 641 spin_unlock(&init_mm.page_table_lock);
642 if (new) 642 if (new)
643 pte_free_kernel(&init_mm, new); 643 pte_free_kernel(&init_mm, new);
644 return 0; 644 return 0;
645 } 645 }
646 646
647 static inline void init_rss_vec(int *rss) 647 static inline void init_rss_vec(int *rss)
648 { 648 {
649 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); 649 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
650 } 650 }
651 651
652 static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) 652 static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
653 { 653 {
654 int i; 654 int i;
655 655
656 if (current->mm == mm) 656 if (current->mm == mm)
657 sync_mm_rss(mm); 657 sync_mm_rss(mm);
658 for (i = 0; i < NR_MM_COUNTERS; i++) 658 for (i = 0; i < NR_MM_COUNTERS; i++)
659 if (rss[i]) 659 if (rss[i])
660 add_mm_counter(mm, i, rss[i]); 660 add_mm_counter(mm, i, rss[i]);
661 } 661 }
662 662
663 /* 663 /*
664 * This function is called to print an error when a bad pte 664 * This function is called to print an error when a bad pte
665 * is found. For example, we might have a PFN-mapped pte in 665 * is found. For example, we might have a PFN-mapped pte in
666 * a region that doesn't allow it. 666 * a region that doesn't allow it.
667 * 667 *
668 * The calling function must still handle the error. 668 * The calling function must still handle the error.
669 */ 669 */
670 static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, 670 static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
671 pte_t pte, struct page *page) 671 pte_t pte, struct page *page)
672 { 672 {
673 pgd_t *pgd = pgd_offset(vma->vm_mm, addr); 673 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
674 pud_t *pud = pud_offset(pgd, addr); 674 pud_t *pud = pud_offset(pgd, addr);
675 pmd_t *pmd = pmd_offset(pud, addr); 675 pmd_t *pmd = pmd_offset(pud, addr);
676 struct address_space *mapping; 676 struct address_space *mapping;
677 pgoff_t index; 677 pgoff_t index;
678 static unsigned long resume; 678 static unsigned long resume;
679 static unsigned long nr_shown; 679 static unsigned long nr_shown;
680 static unsigned long nr_unshown; 680 static unsigned long nr_unshown;
681 681
682 /* 682 /*
683 * Allow a burst of 60 reports, then keep quiet for that minute; 683 * Allow a burst of 60 reports, then keep quiet for that minute;
684 * or allow a steady drip of one report per second. 684 * or allow a steady drip of one report per second.
685 */ 685 */
686 if (nr_shown == 60) { 686 if (nr_shown == 60) {
687 if (time_before(jiffies, resume)) { 687 if (time_before(jiffies, resume)) {
688 nr_unshown++; 688 nr_unshown++;
689 return; 689 return;
690 } 690 }
691 if (nr_unshown) { 691 if (nr_unshown) {
692 printk(KERN_ALERT 692 printk(KERN_ALERT
693 "BUG: Bad page map: %lu messages suppressed\n", 693 "BUG: Bad page map: %lu messages suppressed\n",
694 nr_unshown); 694 nr_unshown);
695 nr_unshown = 0; 695 nr_unshown = 0;
696 } 696 }
697 nr_shown = 0; 697 nr_shown = 0;
698 } 698 }
699 if (nr_shown++ == 0) 699 if (nr_shown++ == 0)
700 resume = jiffies + 60 * HZ; 700 resume = jiffies + 60 * HZ;
701 701
702 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; 702 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
703 index = linear_page_index(vma, addr); 703 index = linear_page_index(vma, addr);
704 704
705 printk(KERN_ALERT 705 printk(KERN_ALERT
706 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", 706 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
707 current->comm, 707 current->comm,
708 (long long)pte_val(pte), (long long)pmd_val(*pmd)); 708 (long long)pte_val(pte), (long long)pmd_val(*pmd));
709 if (page) 709 if (page)
710 dump_page(page); 710 dump_page(page);
711 printk(KERN_ALERT 711 printk(KERN_ALERT
712 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", 712 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
713 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); 713 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
714 /* 714 /*
715 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y 715 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
716 */ 716 */
717 if (vma->vm_ops) 717 if (vma->vm_ops)
718 print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n", 718 print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
719 (unsigned long)vma->vm_ops->fault); 719 (unsigned long)vma->vm_ops->fault);
720 if (vma->vm_file && vma->vm_file->f_op) 720 if (vma->vm_file && vma->vm_file->f_op)
721 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", 721 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
722 (unsigned long)vma->vm_file->f_op->mmap); 722 (unsigned long)vma->vm_file->f_op->mmap);
723 dump_stack(); 723 dump_stack();
724 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 724 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
725 } 725 }
726 726
727 static inline bool is_cow_mapping(vm_flags_t flags) 727 static inline bool is_cow_mapping(vm_flags_t flags)
728 { 728 {
729 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 729 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
730 } 730 }
731 731
732 /* 732 /*
733 * vm_normal_page -- This function gets the "struct page" associated with a pte. 733 * vm_normal_page -- This function gets the "struct page" associated with a pte.
734 * 734 *
735 * "Special" mappings do not wish to be associated with a "struct page" (either 735 * "Special" mappings do not wish to be associated with a "struct page" (either
736 * it doesn't exist, or it exists but they don't want to touch it). In this 736 * it doesn't exist, or it exists but they don't want to touch it). In this
737 * case, NULL is returned here. "Normal" mappings do have a struct page. 737 * case, NULL is returned here. "Normal" mappings do have a struct page.
738 * 738 *
739 * There are 2 broad cases. Firstly, an architecture may define a pte_special() 739 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
740 * pte bit, in which case this function is trivial. Secondly, an architecture 740 * pte bit, in which case this function is trivial. Secondly, an architecture
741 * may not have a spare pte bit, which requires a more complicated scheme, 741 * may not have a spare pte bit, which requires a more complicated scheme,
742 * described below. 742 * described below.
743 * 743 *
744 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a 744 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
745 * special mapping (even if there are underlying and valid "struct pages"). 745 * special mapping (even if there are underlying and valid "struct pages").
746 * COWed pages of a VM_PFNMAP are always normal. 746 * COWed pages of a VM_PFNMAP are always normal.
747 * 747 *
748 * The way we recognize COWed pages within VM_PFNMAP mappings is through the 748 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
749 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit 749 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
750 * set, and the vm_pgoff will point to the first PFN mapped: thus every special 750 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
751 * mapping will always honor the rule 751 * mapping will always honor the rule
752 * 752 *
753 * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) 753 * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
754 * 754 *
755 * And for normal mappings this is false. 755 * And for normal mappings this is false.
756 * 756 *
757 * This restricts such mappings to be a linear translation from virtual address 757 * This restricts such mappings to be a linear translation from virtual address
758 * to pfn. To get around this restriction, we allow arbitrary mappings so long 758 * to pfn. To get around this restriction, we allow arbitrary mappings so long
759 * as the vma is not a COW mapping; in that case, we know that all ptes are 759 * as the vma is not a COW mapping; in that case, we know that all ptes are
760 * special (because none can have been COWed). 760 * special (because none can have been COWed).
761 * 761 *
762 * 762 *
763 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP. 763 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
764 * 764 *
765 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct 765 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
766 * page" backing, however the difference is that _all_ pages with a struct 766 * page" backing, however the difference is that _all_ pages with a struct
767 * page (that is, those where pfn_valid is true) are refcounted and considered 767 * page (that is, those where pfn_valid is true) are refcounted and considered
768 * normal pages by the VM. The disadvantage is that pages are refcounted 768 * normal pages by the VM. The disadvantage is that pages are refcounted
769 * (which can be slower and simply not an option for some PFNMAP users). The 769 * (which can be slower and simply not an option for some PFNMAP users). The
770 * advantage is that we don't have to follow the strict linearity rule of 770 * advantage is that we don't have to follow the strict linearity rule of
771 * PFNMAP mappings in order to support COWable mappings. 771 * PFNMAP mappings in order to support COWable mappings.
772 * 772 *
773 */ 773 */
774 #ifdef __HAVE_ARCH_PTE_SPECIAL 774 #ifdef __HAVE_ARCH_PTE_SPECIAL
775 # define HAVE_PTE_SPECIAL 1 775 # define HAVE_PTE_SPECIAL 1
776 #else 776 #else
777 # define HAVE_PTE_SPECIAL 0 777 # define HAVE_PTE_SPECIAL 0
778 #endif 778 #endif
779 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, 779 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
780 pte_t pte) 780 pte_t pte)
781 { 781 {
782 unsigned long pfn = pte_pfn(pte); 782 unsigned long pfn = pte_pfn(pte);
783 783
784 if (HAVE_PTE_SPECIAL) { 784 if (HAVE_PTE_SPECIAL) {
785 if (likely(!pte_special(pte))) 785 if (likely(!pte_special(pte)))
786 goto check_pfn; 786 goto check_pfn;
787 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) 787 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
788 return NULL; 788 return NULL;
789 if (!is_zero_pfn(pfn)) 789 if (!is_zero_pfn(pfn))
790 print_bad_pte(vma, addr, pte, NULL); 790 print_bad_pte(vma, addr, pte, NULL);
791 return NULL; 791 return NULL;
792 } 792 }
793 793
794 /* !HAVE_PTE_SPECIAL case follows: */ 794 /* !HAVE_PTE_SPECIAL case follows: */
795 795
796 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { 796 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
797 if (vma->vm_flags & VM_MIXEDMAP) { 797 if (vma->vm_flags & VM_MIXEDMAP) {
798 if (!pfn_valid(pfn)) 798 if (!pfn_valid(pfn))
799 return NULL; 799 return NULL;
800 goto out; 800 goto out;
801 } else { 801 } else {
802 unsigned long off; 802 unsigned long off;
803 off = (addr - vma->vm_start) >> PAGE_SHIFT; 803 off = (addr - vma->vm_start) >> PAGE_SHIFT;
804 if (pfn == vma->vm_pgoff + off) 804 if (pfn == vma->vm_pgoff + off)
805 return NULL; 805 return NULL;
806 if (!is_cow_mapping(vma->vm_flags)) 806 if (!is_cow_mapping(vma->vm_flags))
807 return NULL; 807 return NULL;
808 } 808 }
809 } 809 }
810 810
811 if (is_zero_pfn(pfn)) 811 if (is_zero_pfn(pfn))
812 return NULL; 812 return NULL;
813 check_pfn: 813 check_pfn:
814 if (unlikely(pfn > highest_memmap_pfn)) { 814 if (unlikely(pfn > highest_memmap_pfn)) {
815 print_bad_pte(vma, addr, pte, NULL); 815 print_bad_pte(vma, addr, pte, NULL);
816 return NULL; 816 return NULL;
817 } 817 }
818 818
819 /* 819 /*
820 * NOTE! We still have PageReserved() pages in the page tables. 820 * NOTE! We still have PageReserved() pages in the page tables.
821 * eg. VDSO mappings can cause them to exist. 821 * eg. VDSO mappings can cause them to exist.
822 */ 822 */
823 out: 823 out:
824 return pfn_to_page(pfn); 824 return pfn_to_page(pfn);
825 } 825 }
826 826
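The remap_pfn_range() rule quoted in the comment before vm_normal_page() is exactly what the !HAVE_PTE_SPECIAL branch checks. Factored out as an editor's sketch (the helper name is invented; vm_normal_page() open-codes this test):

/* Editor's sketch: the linearity rule a raw VM_PFNMAP mapping must obey;
 * a pfn that matches is special, a COWed page cannot match and is normal. */
static inline bool demo_pfnmap_pfn_is_linear(struct vm_area_struct *vma,
                                             unsigned long addr,
                                             unsigned long pfn)
{
        unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;

        return pfn == vma->vm_pgoff + off;
}
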
827 /* 827 /*
828 * copy one vm_area from one task to the other. Assumes the page tables 828 * copy one vm_area from one task to the other. Assumes the page tables
829 * already present in the new task to be cleared in the whole range 829 * already present in the new task to be cleared in the whole range
830 * covered by this vma. 830 * covered by this vma.
831 */ 831 */
832 832
833 static inline unsigned long 833 static inline unsigned long
834 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, 834 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
835 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, 835 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
836 unsigned long addr, int *rss) 836 unsigned long addr, int *rss)
837 { 837 {
838 unsigned long vm_flags = vma->vm_flags; 838 unsigned long vm_flags = vma->vm_flags;
839 pte_t pte = *src_pte; 839 pte_t pte = *src_pte;
840 struct page *page; 840 struct page *page;
841 841
842 /* pte contains position in swap or file, so copy. */ 842 /* pte contains position in swap or file, so copy. */
843 if (unlikely(!pte_present(pte))) { 843 if (unlikely(!pte_present(pte))) {
844 if (!pte_file(pte)) { 844 if (!pte_file(pte)) {
845 swp_entry_t entry = pte_to_swp_entry(pte); 845 swp_entry_t entry = pte_to_swp_entry(pte);
846 846
847 if (swap_duplicate(entry) < 0) 847 if (swap_duplicate(entry) < 0)
848 return entry.val; 848 return entry.val;
849 849
850 /* make sure dst_mm is on swapoff's mmlist. */ 850 /* make sure dst_mm is on swapoff's mmlist. */
851 if (unlikely(list_empty(&dst_mm->mmlist))) { 851 if (unlikely(list_empty(&dst_mm->mmlist))) {
852 spin_lock(&mmlist_lock); 852 spin_lock(&mmlist_lock);
853 if (list_empty(&dst_mm->mmlist)) 853 if (list_empty(&dst_mm->mmlist))
854 list_add(&dst_mm->mmlist, 854 list_add(&dst_mm->mmlist,
855 &src_mm->mmlist); 855 &src_mm->mmlist);
856 spin_unlock(&mmlist_lock); 856 spin_unlock(&mmlist_lock);
857 } 857 }
858 if (likely(!non_swap_entry(entry))) 858 if (likely(!non_swap_entry(entry)))
859 rss[MM_SWAPENTS]++; 859 rss[MM_SWAPENTS]++;
860 else if (is_migration_entry(entry)) { 860 else if (is_migration_entry(entry)) {
861 page = migration_entry_to_page(entry); 861 page = migration_entry_to_page(entry);
862 862
863 if (PageAnon(page)) 863 if (PageAnon(page))
864 rss[MM_ANONPAGES]++; 864 rss[MM_ANONPAGES]++;
865 else 865 else
866 rss[MM_FILEPAGES]++; 866 rss[MM_FILEPAGES]++;
867 867
868 if (is_write_migration_entry(entry) && 868 if (is_write_migration_entry(entry) &&
869 is_cow_mapping(vm_flags)) { 869 is_cow_mapping(vm_flags)) {
870 /* 870 /*
871 * COW mappings require pages in both 871 * COW mappings require pages in both
872 * parent and child to be set to read. 872 * parent and child to be set to read.
873 */ 873 */
874 make_migration_entry_read(&entry); 874 make_migration_entry_read(&entry);
875 pte = swp_entry_to_pte(entry); 875 pte = swp_entry_to_pte(entry);
876 set_pte_at(src_mm, addr, src_pte, pte); 876 set_pte_at(src_mm, addr, src_pte, pte);
877 } 877 }
878 } 878 }
879 } 879 }
880 goto out_set_pte; 880 goto out_set_pte;
881 } 881 }
882 882
883 /* 883 /*
884 * If it's a COW mapping, write protect it both 884 * If it's a COW mapping, write protect it both
885 * in the parent and the child 885 * in the parent and the child
886 */ 886 */
887 if (is_cow_mapping(vm_flags)) { 887 if (is_cow_mapping(vm_flags)) {
888 ptep_set_wrprotect(src_mm, addr, src_pte); 888 ptep_set_wrprotect(src_mm, addr, src_pte);
889 pte = pte_wrprotect(pte); 889 pte = pte_wrprotect(pte);
890 } 890 }
891 891
892 /* 892 /*
893 * If it's a shared mapping, mark it clean in 893 * If it's a shared mapping, mark it clean in
894 * the child 894 * the child
895 */ 895 */
896 if (vm_flags & VM_SHARED) 896 if (vm_flags & VM_SHARED)
897 pte = pte_mkclean(pte); 897 pte = pte_mkclean(pte);
898 pte = pte_mkold(pte); 898 pte = pte_mkold(pte);
899 899
900 page = vm_normal_page(vma, addr, pte); 900 page = vm_normal_page(vma, addr, pte);
901 if (page) { 901 if (page) {
902 get_page(page); 902 get_page(page);
903 page_dup_rmap(page); 903 page_dup_rmap(page);
904 if (PageAnon(page)) 904 if (PageAnon(page))
905 rss[MM_ANONPAGES]++; 905 rss[MM_ANONPAGES]++;
906 else 906 else
907 rss[MM_FILEPAGES]++; 907 rss[MM_FILEPAGES]++;
908 } 908 }
909 909
910 out_set_pte: 910 out_set_pte:
911 set_pte_at(dst_mm, addr, dst_pte, pte); 911 set_pte_at(dst_mm, addr, dst_pte, pte);
912 return 0; 912 return 0;
913 } 913 }
914 914
915 int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 915 int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
916 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, 916 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
917 unsigned long addr, unsigned long end) 917 unsigned long addr, unsigned long end)
918 { 918 {
919 pte_t *orig_src_pte, *orig_dst_pte; 919 pte_t *orig_src_pte, *orig_dst_pte;
920 pte_t *src_pte, *dst_pte; 920 pte_t *src_pte, *dst_pte;
921 spinlock_t *src_ptl, *dst_ptl; 921 spinlock_t *src_ptl, *dst_ptl;
922 int progress = 0; 922 int progress = 0;
923 int rss[NR_MM_COUNTERS]; 923 int rss[NR_MM_COUNTERS];
924 swp_entry_t entry = (swp_entry_t){0}; 924 swp_entry_t entry = (swp_entry_t){0};
925 925
926 again: 926 again:
927 init_rss_vec(rss); 927 init_rss_vec(rss);
928 928
929 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); 929 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
930 if (!dst_pte) 930 if (!dst_pte)
931 return -ENOMEM; 931 return -ENOMEM;
932 src_pte = pte_offset_map(src_pmd, addr); 932 src_pte = pte_offset_map(src_pmd, addr);
933 src_ptl = pte_lockptr(src_mm, src_pmd); 933 src_ptl = pte_lockptr(src_mm, src_pmd);
934 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 934 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
935 orig_src_pte = src_pte; 935 orig_src_pte = src_pte;
936 orig_dst_pte = dst_pte; 936 orig_dst_pte = dst_pte;
937 arch_enter_lazy_mmu_mode(); 937 arch_enter_lazy_mmu_mode();
938 938
939 do { 939 do {
940 /* 940 /*
941 * We are holding two locks at this point - either of them 941 * We are holding two locks at this point - either of them
942 * could generate latencies in another task on another CPU. 942 * could generate latencies in another task on another CPU.
943 */ 943 */
944 if (progress >= 32) { 944 if (progress >= 32) {
945 progress = 0; 945 progress = 0;
946 if (need_resched() || 946 if (need_resched() ||
947 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) 947 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
948 break; 948 break;
949 } 949 }
950 if (pte_none(*src_pte)) { 950 if (pte_none(*src_pte)) {
951 progress++; 951 progress++;
952 continue; 952 continue;
953 } 953 }
954 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, 954 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
955 vma, addr, rss); 955 vma, addr, rss);
956 if (entry.val) 956 if (entry.val)
957 break; 957 break;
958 progress += 8; 958 progress += 8;
959 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); 959 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
960 960
961 arch_leave_lazy_mmu_mode(); 961 arch_leave_lazy_mmu_mode();
962 spin_unlock(src_ptl); 962 spin_unlock(src_ptl);
963 pte_unmap(orig_src_pte); 963 pte_unmap(orig_src_pte);
964 add_mm_rss_vec(dst_mm, rss); 964 add_mm_rss_vec(dst_mm, rss);
965 pte_unmap_unlock(orig_dst_pte, dst_ptl); 965 pte_unmap_unlock(orig_dst_pte, dst_ptl);
966 cond_resched(); 966 cond_resched();
967 967
968 if (entry.val) { 968 if (entry.val) {
969 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) 969 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
970 return -ENOMEM; 970 return -ENOMEM;
971 progress = 0; 971 progress = 0;
972 } 972 }
973 if (addr != end) 973 if (addr != end)
974 goto again; 974 goto again;
975 return 0; 975 return 0;
976 } 976 }
977 977
978 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 978 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
979 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, 979 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
980 unsigned long addr, unsigned long end) 980 unsigned long addr, unsigned long end)
981 { 981 {
982 pmd_t *src_pmd, *dst_pmd; 982 pmd_t *src_pmd, *dst_pmd;
983 unsigned long next; 983 unsigned long next;
984 984
985 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); 985 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
986 if (!dst_pmd) 986 if (!dst_pmd)
987 return -ENOMEM; 987 return -ENOMEM;
988 src_pmd = pmd_offset(src_pud, addr); 988 src_pmd = pmd_offset(src_pud, addr);
989 do { 989 do {
990 next = pmd_addr_end(addr, end); 990 next = pmd_addr_end(addr, end);
991 if (pmd_trans_huge(*src_pmd)) { 991 if (pmd_trans_huge(*src_pmd)) {
992 int err; 992 int err;
993 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); 993 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
994 err = copy_huge_pmd(dst_mm, src_mm, 994 err = copy_huge_pmd(dst_mm, src_mm,
995 dst_pmd, src_pmd, addr, vma); 995 dst_pmd, src_pmd, addr, vma);
996 if (err == -ENOMEM) 996 if (err == -ENOMEM)
997 return -ENOMEM; 997 return -ENOMEM;
998 if (!err) 998 if (!err)
999 continue; 999 continue;
1000 /* fall through */ 1000 /* fall through */
1001 } 1001 }
1002 if (pmd_none_or_clear_bad(src_pmd)) 1002 if (pmd_none_or_clear_bad(src_pmd))
1003 continue; 1003 continue;
1004 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, 1004 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
1005 vma, addr, next)) 1005 vma, addr, next))
1006 return -ENOMEM; 1006 return -ENOMEM;
1007 } while (dst_pmd++, src_pmd++, addr = next, addr != end); 1007 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
1008 return 0; 1008 return 0;
1009 } 1009 }
1010 1010
1011 static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 1011 static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1012 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, 1012 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
1013 unsigned long addr, unsigned long end) 1013 unsigned long addr, unsigned long end)
1014 { 1014 {
1015 pud_t *src_pud, *dst_pud; 1015 pud_t *src_pud, *dst_pud;
1016 unsigned long next; 1016 unsigned long next;
1017 1017
1018 dst_pud = pud_alloc(dst_mm, dst_pgd, addr); 1018 dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
1019 if (!dst_pud) 1019 if (!dst_pud)
1020 return -ENOMEM; 1020 return -ENOMEM;
1021 src_pud = pud_offset(src_pgd, addr); 1021 src_pud = pud_offset(src_pgd, addr);
1022 do { 1022 do {
1023 next = pud_addr_end(addr, end); 1023 next = pud_addr_end(addr, end);
1024 if (pud_none_or_clear_bad(src_pud)) 1024 if (pud_none_or_clear_bad(src_pud))
1025 continue; 1025 continue;
1026 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, 1026 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
1027 vma, addr, next)) 1027 vma, addr, next))
1028 return -ENOMEM; 1028 return -ENOMEM;
1029 } while (dst_pud++, src_pud++, addr = next, addr != end); 1029 } while (dst_pud++, src_pud++, addr = next, addr != end);
1030 return 0; 1030 return 0;
1031 } 1031 }
1032 1032
1033 int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 1033 int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1034 struct vm_area_struct *vma) 1034 struct vm_area_struct *vma)
1035 { 1035 {
1036 pgd_t *src_pgd, *dst_pgd; 1036 pgd_t *src_pgd, *dst_pgd;
1037 unsigned long next; 1037 unsigned long next;
1038 unsigned long addr = vma->vm_start; 1038 unsigned long addr = vma->vm_start;
1039 unsigned long end = vma->vm_end; 1039 unsigned long end = vma->vm_end;
1040 unsigned long mmun_start; /* For mmu_notifiers */ 1040 unsigned long mmun_start; /* For mmu_notifiers */
1041 unsigned long mmun_end; /* For mmu_notifiers */ 1041 unsigned long mmun_end; /* For mmu_notifiers */
1042 bool is_cow; 1042 bool is_cow;
1043 int ret; 1043 int ret;
1044 1044
1045 /* 1045 /*
1046 * Don't copy ptes where a page fault will fill them correctly. 1046 * Don't copy ptes where a page fault will fill them correctly.
1047 * Fork becomes much lighter when there are big shared or private 1047 * Fork becomes much lighter when there are big shared or private
1048 * readonly mappings. The tradeoff is that copy_page_range is more 1048 * readonly mappings. The tradeoff is that copy_page_range is more
1049 * efficient than faulting. 1049 * efficient than faulting.
1050 */ 1050 */
1051 if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR | 1051 if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
1052 VM_PFNMAP | VM_MIXEDMAP))) { 1052 VM_PFNMAP | VM_MIXEDMAP))) {
1053 if (!vma->anon_vma) 1053 if (!vma->anon_vma)
1054 return 0; 1054 return 0;
1055 } 1055 }
1056 1056
1057 if (is_vm_hugetlb_page(vma)) 1057 if (is_vm_hugetlb_page(vma))
1058 return copy_hugetlb_page_range(dst_mm, src_mm, vma); 1058 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
1059 1059
1060 if (unlikely(vma->vm_flags & VM_PFNMAP)) { 1060 if (unlikely(vma->vm_flags & VM_PFNMAP)) {
1061 /* 1061 /*
1062 * We do not free on error cases below as remove_vma 1062 * We do not free on error cases below as remove_vma
1063 * gets called on error from higher level routine 1063 * gets called on error from higher level routine
1064 */ 1064 */
1065 ret = track_pfn_copy(vma); 1065 ret = track_pfn_copy(vma);
1066 if (ret) 1066 if (ret)
1067 return ret; 1067 return ret;
1068 } 1068 }
1069 1069
1070 /* 1070 /*
1071 * We need to invalidate the secondary MMU mappings only when 1071 * We need to invalidate the secondary MMU mappings only when
1072 * there could be a permission downgrade on the ptes of the 1072 * there could be a permission downgrade on the ptes of the
1073 * parent mm. And a permission downgrade will only happen if 1073 * parent mm. And a permission downgrade will only happen if
1074 * is_cow_mapping() returns true. 1074 * is_cow_mapping() returns true.
1075 */ 1075 */
1076 is_cow = is_cow_mapping(vma->vm_flags); 1076 is_cow = is_cow_mapping(vma->vm_flags);
1077 mmun_start = addr; 1077 mmun_start = addr;
1078 mmun_end = end; 1078 mmun_end = end;
1079 if (is_cow) 1079 if (is_cow)
1080 mmu_notifier_invalidate_range_start(src_mm, mmun_start, 1080 mmu_notifier_invalidate_range_start(src_mm, mmun_start,
1081 mmun_end); 1081 mmun_end);
1082 1082
1083 ret = 0; 1083 ret = 0;
1084 dst_pgd = pgd_offset(dst_mm, addr); 1084 dst_pgd = pgd_offset(dst_mm, addr);
1085 src_pgd = pgd_offset(src_mm, addr); 1085 src_pgd = pgd_offset(src_mm, addr);
1086 do { 1086 do {
1087 next = pgd_addr_end(addr, end); 1087 next = pgd_addr_end(addr, end);
1088 if (pgd_none_or_clear_bad(src_pgd)) 1088 if (pgd_none_or_clear_bad(src_pgd))
1089 continue; 1089 continue;
1090 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, 1090 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
1091 vma, addr, next))) { 1091 vma, addr, next))) {
1092 ret = -ENOMEM; 1092 ret = -ENOMEM;
1093 break; 1093 break;
1094 } 1094 }
1095 } while (dst_pgd++, src_pgd++, addr = next, addr != end); 1095 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1096 1096
1097 if (is_cow) 1097 if (is_cow)
1098 mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end); 1098 mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
1099 return ret; 1099 return ret;
1100 } 1100 }
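
The function above is the fork-time page-table copy; for COW-able mappings it write-protects the parent's ptes rather than duplicating the pages, and the comments explain when copying is skipped entirely. As a purely illustrative userspace sketch (not part of this patch), the program below shows the resulting user-visible contract: after fork(), writes on either side of a MAP_PRIVATE anonymous mapping stay private to that process.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        /* One private anonymous page; fork() shares it copy-on-write. */
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        strcpy(p, "parent data");

        pid_t pid = fork();
        if (pid == 0) {
                /* Child's write triggers a COW fault; parent is unaffected. */
                strcpy(p, "child data");
                printf("child sees : %s\n", p);
                _exit(0);
        }
        waitpid(pid, NULL, 0);
        printf("parent sees: %s\n", p);   /* still "parent data" */
        munmap(p, 4096);
        return 0;
}
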
1101 1101
1102 static unsigned long zap_pte_range(struct mmu_gather *tlb, 1102 static unsigned long zap_pte_range(struct mmu_gather *tlb,
1103 struct vm_area_struct *vma, pmd_t *pmd, 1103 struct vm_area_struct *vma, pmd_t *pmd,
1104 unsigned long addr, unsigned long end, 1104 unsigned long addr, unsigned long end,
1105 struct zap_details *details) 1105 struct zap_details *details)
1106 { 1106 {
1107 struct mm_struct *mm = tlb->mm; 1107 struct mm_struct *mm = tlb->mm;
1108 int force_flush = 0; 1108 int force_flush = 0;
1109 int rss[NR_MM_COUNTERS]; 1109 int rss[NR_MM_COUNTERS];
1110 spinlock_t *ptl; 1110 spinlock_t *ptl;
1111 pte_t *start_pte; 1111 pte_t *start_pte;
1112 pte_t *pte; 1112 pte_t *pte;
1113 1113
1114 again: 1114 again:
1115 init_rss_vec(rss); 1115 init_rss_vec(rss);
1116 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 1116 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1117 pte = start_pte; 1117 pte = start_pte;
1118 arch_enter_lazy_mmu_mode(); 1118 arch_enter_lazy_mmu_mode();
1119 do { 1119 do {
1120 pte_t ptent = *pte; 1120 pte_t ptent = *pte;
1121 if (pte_none(ptent)) { 1121 if (pte_none(ptent)) {
1122 continue; 1122 continue;
1123 } 1123 }
1124 1124
1125 if (pte_present(ptent)) { 1125 if (pte_present(ptent)) {
1126 struct page *page; 1126 struct page *page;
1127 1127
1128 page = vm_normal_page(vma, addr, ptent); 1128 page = vm_normal_page(vma, addr, ptent);
1129 if (unlikely(details) && page) { 1129 if (unlikely(details) && page) {
1130 /* 1130 /*
1131 * unmap_shared_mapping_pages() wants to 1131 * unmap_shared_mapping_pages() wants to
1132 * invalidate cache without truncating: 1132 * invalidate cache without truncating:
1133 * unmap shared but keep private pages. 1133 * unmap shared but keep private pages.
1134 */ 1134 */
1135 if (details->check_mapping && 1135 if (details->check_mapping &&
1136 details->check_mapping != page->mapping) 1136 details->check_mapping != page->mapping)
1137 continue; 1137 continue;
1138 /* 1138 /*
1139 * Each page->index must be checked when 1139 * Each page->index must be checked when
1140 * invalidating or truncating nonlinear. 1140 * invalidating or truncating nonlinear.
1141 */ 1141 */
1142 if (details->nonlinear_vma && 1142 if (details->nonlinear_vma &&
1143 (page->index < details->first_index || 1143 (page->index < details->first_index ||
1144 page->index > details->last_index)) 1144 page->index > details->last_index))
1145 continue; 1145 continue;
1146 } 1146 }
1147 ptent = ptep_get_and_clear_full(mm, addr, pte, 1147 ptent = ptep_get_and_clear_full(mm, addr, pte,
1148 tlb->fullmm); 1148 tlb->fullmm);
1149 tlb_remove_tlb_entry(tlb, pte, addr); 1149 tlb_remove_tlb_entry(tlb, pte, addr);
1150 if (unlikely(!page)) 1150 if (unlikely(!page))
1151 continue; 1151 continue;
1152 if (unlikely(details) && details->nonlinear_vma 1152 if (unlikely(details) && details->nonlinear_vma
1153 && linear_page_index(details->nonlinear_vma, 1153 && linear_page_index(details->nonlinear_vma,
1154 addr) != page->index) 1154 addr) != page->index)
1155 set_pte_at(mm, addr, pte, 1155 set_pte_at(mm, addr, pte,
1156 pgoff_to_pte(page->index)); 1156 pgoff_to_pte(page->index));
1157 if (PageAnon(page)) 1157 if (PageAnon(page))
1158 rss[MM_ANONPAGES]--; 1158 rss[MM_ANONPAGES]--;
1159 else { 1159 else {
1160 if (pte_dirty(ptent)) 1160 if (pte_dirty(ptent))
1161 set_page_dirty(page); 1161 set_page_dirty(page);
1162 if (pte_young(ptent) && 1162 if (pte_young(ptent) &&
1163 likely(!VM_SequentialReadHint(vma))) 1163 likely(!VM_SequentialReadHint(vma)))
1164 mark_page_accessed(page); 1164 mark_page_accessed(page);
1165 rss[MM_FILEPAGES]--; 1165 rss[MM_FILEPAGES]--;
1166 } 1166 }
1167 page_remove_rmap(page); 1167 page_remove_rmap(page);
1168 if (unlikely(page_mapcount(page) < 0)) 1168 if (unlikely(page_mapcount(page) < 0))
1169 print_bad_pte(vma, addr, ptent, page); 1169 print_bad_pte(vma, addr, ptent, page);
1170 force_flush = !__tlb_remove_page(tlb, page); 1170 force_flush = !__tlb_remove_page(tlb, page);
1171 if (force_flush) 1171 if (force_flush)
1172 break; 1172 break;
1173 continue; 1173 continue;
1174 } 1174 }
1175 /* 1175 /*
1176 * If details->check_mapping, we leave swap entries; 1176 * If details->check_mapping, we leave swap entries;
1177 * if details->nonlinear_vma, we leave file entries. 1177 * if details->nonlinear_vma, we leave file entries.
1178 */ 1178 */
1179 if (unlikely(details)) 1179 if (unlikely(details))
1180 continue; 1180 continue;
1181 if (pte_file(ptent)) { 1181 if (pte_file(ptent)) {
1182 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) 1182 if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
1183 print_bad_pte(vma, addr, ptent, NULL); 1183 print_bad_pte(vma, addr, ptent, NULL);
1184 } else { 1184 } else {
1185 swp_entry_t entry = pte_to_swp_entry(ptent); 1185 swp_entry_t entry = pte_to_swp_entry(ptent);
1186 1186
1187 if (!non_swap_entry(entry)) 1187 if (!non_swap_entry(entry))
1188 rss[MM_SWAPENTS]--; 1188 rss[MM_SWAPENTS]--;
1189 else if (is_migration_entry(entry)) { 1189 else if (is_migration_entry(entry)) {
1190 struct page *page; 1190 struct page *page;
1191 1191
1192 page = migration_entry_to_page(entry); 1192 page = migration_entry_to_page(entry);
1193 1193
1194 if (PageAnon(page)) 1194 if (PageAnon(page))
1195 rss[MM_ANONPAGES]--; 1195 rss[MM_ANONPAGES]--;
1196 else 1196 else
1197 rss[MM_FILEPAGES]--; 1197 rss[MM_FILEPAGES]--;
1198 } 1198 }
1199 if (unlikely(!free_swap_and_cache(entry))) 1199 if (unlikely(!free_swap_and_cache(entry)))
1200 print_bad_pte(vma, addr, ptent, NULL); 1200 print_bad_pte(vma, addr, ptent, NULL);
1201 } 1201 }
1202 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 1202 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1203 } while (pte++, addr += PAGE_SIZE, addr != end); 1203 } while (pte++, addr += PAGE_SIZE, addr != end);
1204 1204
1205 add_mm_rss_vec(mm, rss); 1205 add_mm_rss_vec(mm, rss);
1206 arch_leave_lazy_mmu_mode(); 1206 arch_leave_lazy_mmu_mode();
1207 pte_unmap_unlock(start_pte, ptl); 1207 pte_unmap_unlock(start_pte, ptl);
1208 1208
1209 /* 1209 /*
1210 * mmu_gather ran out of room to batch pages, we break out of 1210 * mmu_gather ran out of room to batch pages, we break out of
1211 * the PTE lock to avoid doing the potential expensive TLB invalidate 1211 * the PTE lock to avoid doing the potential expensive TLB invalidate
1212 * and page-free while holding it. 1212 * and page-free while holding it.
1213 */ 1213 */
1214 if (force_flush) { 1214 if (force_flush) {
1215 force_flush = 0; 1215 force_flush = 0;
1216 1216
1217 #ifdef HAVE_GENERIC_MMU_GATHER 1217 #ifdef HAVE_GENERIC_MMU_GATHER
1218 tlb->start = addr; 1218 tlb->start = addr;
1219 tlb->end = end; 1219 tlb->end = end;
1220 #endif 1220 #endif
1221 tlb_flush_mmu(tlb); 1221 tlb_flush_mmu(tlb);
1222 if (addr != end) 1222 if (addr != end)
1223 goto again; 1223 goto again;
1224 } 1224 }
1225 1225
1226 return addr; 1226 return addr;
1227 } 1227 }
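
zap_pte_range() is where the per-pte teardown actually happens: rss counters are decremented, dirty and referenced state is transferred to the struct page, and the mmu_gather batch is filled. A small userspace sketch of the observable effect (illustrative only; exact numbers depend on page size and kernel behaviour): after madvise(MADV_DONTNEED) the resident set shrinks and the anonymous pages read back as zeroes on the next fault.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

static long resident_pages(void)
{
        long size = 0, resident = -1;
        FILE *f = fopen("/proc/self/statm", "r");

        if (f && fscanf(f, "%ld %ld", &size, &resident) != 2)
                resident = -1;
        if (f)
                fclose(f);
        return resident;
}

int main(void)
{
        size_t len = 64UL << 20;          /* 64 MiB of private anonymous memory */
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                return 1;

        memset(p, 0xaa, len);             /* fault everything in */
        printf("resident before: %ld pages\n", resident_pages());

        madvise(p, len, MADV_DONTNEED);   /* ends up in the zap path */
        printf("resident after : %ld pages\n", resident_pages());
        printf("first byte now : %d (zero-filled on the next fault)\n", p[0]);

        munmap(p, len);
        return 0;
}
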
1228 1228
1229 static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, 1229 static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1230 struct vm_area_struct *vma, pud_t *pud, 1230 struct vm_area_struct *vma, pud_t *pud,
1231 unsigned long addr, unsigned long end, 1231 unsigned long addr, unsigned long end,
1232 struct zap_details *details) 1232 struct zap_details *details)
1233 { 1233 {
1234 pmd_t *pmd; 1234 pmd_t *pmd;
1235 unsigned long next; 1235 unsigned long next;
1236 1236
1237 pmd = pmd_offset(pud, addr); 1237 pmd = pmd_offset(pud, addr);
1238 do { 1238 do {
1239 next = pmd_addr_end(addr, end); 1239 next = pmd_addr_end(addr, end);
1240 if (pmd_trans_huge(*pmd)) { 1240 if (pmd_trans_huge(*pmd)) {
1241 if (next - addr != HPAGE_PMD_SIZE) { 1241 if (next - addr != HPAGE_PMD_SIZE) {
1242 #ifdef CONFIG_DEBUG_VM 1242 #ifdef CONFIG_DEBUG_VM
1243 if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { 1243 if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
1244 pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n", 1244 pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
1245 __func__, addr, end, 1245 __func__, addr, end,
1246 vma->vm_start, 1246 vma->vm_start,
1247 vma->vm_end); 1247 vma->vm_end);
1248 BUG(); 1248 BUG();
1249 } 1249 }
1250 #endif 1250 #endif
1251 split_huge_page_pmd(vma, addr, pmd); 1251 split_huge_page_pmd(vma, addr, pmd);
1252 } else if (zap_huge_pmd(tlb, vma, pmd, addr)) 1252 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1253 goto next; 1253 goto next;
1254 /* fall through */ 1254 /* fall through */
1255 } 1255 }
1256 /* 1256 /*
1257 * Here there can be other concurrent MADV_DONTNEED or 1257 * Here there can be other concurrent MADV_DONTNEED or
1258 * trans huge page faults running, and if the pmd is 1258 * trans huge page faults running, and if the pmd is
1259 * none or trans huge it can change under us. This is 1259 * none or trans huge it can change under us. This is
1260 * because MADV_DONTNEED holds the mmap_sem in read 1260 * because MADV_DONTNEED holds the mmap_sem in read
1261 * mode. 1261 * mode.
1262 */ 1262 */
1263 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 1263 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1264 goto next; 1264 goto next;
1265 next = zap_pte_range(tlb, vma, pmd, addr, next, details); 1265 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1266 next: 1266 next:
1267 cond_resched(); 1267 cond_resched();
1268 } while (pmd++, addr = next, addr != end); 1268 } while (pmd++, addr = next, addr != end);
1269 1269
1270 return addr; 1270 return addr;
1271 } 1271 }
1272 1272
1273 static inline unsigned long zap_pud_range(struct mmu_gather *tlb, 1273 static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1274 struct vm_area_struct *vma, pgd_t *pgd, 1274 struct vm_area_struct *vma, pgd_t *pgd,
1275 unsigned long addr, unsigned long end, 1275 unsigned long addr, unsigned long end,
1276 struct zap_details *details) 1276 struct zap_details *details)
1277 { 1277 {
1278 pud_t *pud; 1278 pud_t *pud;
1279 unsigned long next; 1279 unsigned long next;
1280 1280
1281 pud = pud_offset(pgd, addr); 1281 pud = pud_offset(pgd, addr);
1282 do { 1282 do {
1283 next = pud_addr_end(addr, end); 1283 next = pud_addr_end(addr, end);
1284 if (pud_none_or_clear_bad(pud)) 1284 if (pud_none_or_clear_bad(pud))
1285 continue; 1285 continue;
1286 next = zap_pmd_range(tlb, vma, pud, addr, next, details); 1286 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1287 } while (pud++, addr = next, addr != end); 1287 } while (pud++, addr = next, addr != end);
1288 1288
1289 return addr; 1289 return addr;
1290 } 1290 }
1291 1291
1292 static void unmap_page_range(struct mmu_gather *tlb, 1292 static void unmap_page_range(struct mmu_gather *tlb,
1293 struct vm_area_struct *vma, 1293 struct vm_area_struct *vma,
1294 unsigned long addr, unsigned long end, 1294 unsigned long addr, unsigned long end,
1295 struct zap_details *details) 1295 struct zap_details *details)
1296 { 1296 {
1297 pgd_t *pgd; 1297 pgd_t *pgd;
1298 unsigned long next; 1298 unsigned long next;
1299 1299
1300 if (details && !details->check_mapping && !details->nonlinear_vma) 1300 if (details && !details->check_mapping && !details->nonlinear_vma)
1301 details = NULL; 1301 details = NULL;
1302 1302
1303 BUG_ON(addr >= end); 1303 BUG_ON(addr >= end);
1304 mem_cgroup_uncharge_start(); 1304 mem_cgroup_uncharge_start();
1305 tlb_start_vma(tlb, vma); 1305 tlb_start_vma(tlb, vma);
1306 pgd = pgd_offset(vma->vm_mm, addr); 1306 pgd = pgd_offset(vma->vm_mm, addr);
1307 do { 1307 do {
1308 next = pgd_addr_end(addr, end); 1308 next = pgd_addr_end(addr, end);
1309 if (pgd_none_or_clear_bad(pgd)) 1309 if (pgd_none_or_clear_bad(pgd))
1310 continue; 1310 continue;
1311 next = zap_pud_range(tlb, vma, pgd, addr, next, details); 1311 next = zap_pud_range(tlb, vma, pgd, addr, next, details);
1312 } while (pgd++, addr = next, addr != end); 1312 } while (pgd++, addr = next, addr != end);
1313 tlb_end_vma(tlb, vma); 1313 tlb_end_vma(tlb, vma);
1314 mem_cgroup_uncharge_end(); 1314 mem_cgroup_uncharge_end();
1315 } 1315 }
1316 1316
1317 1317
1318 static void unmap_single_vma(struct mmu_gather *tlb, 1318 static void unmap_single_vma(struct mmu_gather *tlb,
1319 struct vm_area_struct *vma, unsigned long start_addr, 1319 struct vm_area_struct *vma, unsigned long start_addr,
1320 unsigned long end_addr, 1320 unsigned long end_addr,
1321 struct zap_details *details) 1321 struct zap_details *details)
1322 { 1322 {
1323 unsigned long start = max(vma->vm_start, start_addr); 1323 unsigned long start = max(vma->vm_start, start_addr);
1324 unsigned long end; 1324 unsigned long end;
1325 1325
1326 if (start >= vma->vm_end) 1326 if (start >= vma->vm_end)
1327 return; 1327 return;
1328 end = min(vma->vm_end, end_addr); 1328 end = min(vma->vm_end, end_addr);
1329 if (end <= vma->vm_start) 1329 if (end <= vma->vm_start)
1330 return; 1330 return;
1331 1331
1332 if (vma->vm_file) 1332 if (vma->vm_file)
1333 uprobe_munmap(vma, start, end); 1333 uprobe_munmap(vma, start, end);
1334 1334
1335 if (unlikely(vma->vm_flags & VM_PFNMAP)) 1335 if (unlikely(vma->vm_flags & VM_PFNMAP))
1336 untrack_pfn(vma, 0, 0); 1336 untrack_pfn(vma, 0, 0);
1337 1337
1338 if (start != end) { 1338 if (start != end) {
1339 if (unlikely(is_vm_hugetlb_page(vma))) { 1339 if (unlikely(is_vm_hugetlb_page(vma))) {
1340 /* 1340 /*
1341 * It is undesirable to test vma->vm_file as it 1341 * It is undesirable to test vma->vm_file as it
1342 * should be non-null for valid hugetlb area. 1342 * should be non-null for valid hugetlb area.
1343 * However, vm_file will be NULL in the error 1343 * However, vm_file will be NULL in the error
1344 * cleanup path of do_mmap_pgoff. When 1344 * cleanup path of do_mmap_pgoff. When
1345 * hugetlbfs ->mmap method fails, 1345 * hugetlbfs ->mmap method fails,
1346 * do_mmap_pgoff() nullifies vma->vm_file 1346 * do_mmap_pgoff() nullifies vma->vm_file
1347 * before calling this function to clean up. 1347 * before calling this function to clean up.
1348 * Since no pte has actually been setup, it is 1348 * Since no pte has actually been setup, it is
1349 * safe to do nothing in this case. 1349 * safe to do nothing in this case.
1350 */ 1350 */
1351 if (vma->vm_file) { 1351 if (vma->vm_file) {
1352 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); 1352 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
1353 __unmap_hugepage_range_final(tlb, vma, start, end, NULL); 1353 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1354 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 1354 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
1355 } 1355 }
1356 } else 1356 } else
1357 unmap_page_range(tlb, vma, start, end, details); 1357 unmap_page_range(tlb, vma, start, end, details);
1358 } 1358 }
1359 } 1359 }
1360 1360
1361 /** 1361 /**
1362 * unmap_vmas - unmap a range of memory covered by a list of vma's 1362 * unmap_vmas - unmap a range of memory covered by a list of vma's
1363 * @tlb: address of the caller's struct mmu_gather 1363 * @tlb: address of the caller's struct mmu_gather
1364 * @vma: the starting vma 1364 * @vma: the starting vma
1365 * @start_addr: virtual address at which to start unmapping 1365 * @start_addr: virtual address at which to start unmapping
1366 * @end_addr: virtual address at which to end unmapping 1366 * @end_addr: virtual address at which to end unmapping
1367 * 1367 *
1368 * Unmap all pages in the vma list. 1368 * Unmap all pages in the vma list.
1369 * 1369 *
1370 * Only addresses between `start' and `end' will be unmapped. 1370 * Only addresses between `start' and `end' will be unmapped.
1371 * 1371 *
1372 * The VMA list must be sorted in ascending virtual address order. 1372 * The VMA list must be sorted in ascending virtual address order.
1373 * 1373 *
1374 * unmap_vmas() assumes that the caller will flush the whole unmapped address 1374 * unmap_vmas() assumes that the caller will flush the whole unmapped address
1375 * range after unmap_vmas() returns. So the only responsibility here is to 1375 * range after unmap_vmas() returns. So the only responsibility here is to
1376 * ensure that any thus-far unmapped pages are flushed before unmap_vmas() 1376 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
1377 * drops the lock and schedules. 1377 * drops the lock and schedules.
1378 */ 1378 */
1379 void unmap_vmas(struct mmu_gather *tlb, 1379 void unmap_vmas(struct mmu_gather *tlb,
1380 struct vm_area_struct *vma, unsigned long start_addr, 1380 struct vm_area_struct *vma, unsigned long start_addr,
1381 unsigned long end_addr) 1381 unsigned long end_addr)
1382 { 1382 {
1383 struct mm_struct *mm = vma->vm_mm; 1383 struct mm_struct *mm = vma->vm_mm;
1384 1384
1385 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); 1385 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
1386 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) 1386 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1387 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL); 1387 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1388 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); 1388 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1389 } 1389 }
1390 1390
1391 /** 1391 /**
1392 * zap_page_range - remove user pages in a given range 1392 * zap_page_range - remove user pages in a given range
1393 * @vma: vm_area_struct holding the applicable pages 1393 * @vma: vm_area_struct holding the applicable pages
1394 * @start: starting address of pages to zap 1394 * @start: starting address of pages to zap
1395 * @size: number of bytes to zap 1395 * @size: number of bytes to zap
1396 * @details: details of nonlinear truncation or shared cache invalidation 1396 * @details: details of nonlinear truncation or shared cache invalidation
1397 * 1397 *
1398 * Caller must protect the VMA list 1398 * Caller must protect the VMA list
1399 */ 1399 */
1400 void zap_page_range(struct vm_area_struct *vma, unsigned long start, 1400 void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1401 unsigned long size, struct zap_details *details) 1401 unsigned long size, struct zap_details *details)
1402 { 1402 {
1403 struct mm_struct *mm = vma->vm_mm; 1403 struct mm_struct *mm = vma->vm_mm;
1404 struct mmu_gather tlb; 1404 struct mmu_gather tlb;
1405 unsigned long end = start + size; 1405 unsigned long end = start + size;
1406 1406
1407 lru_add_drain(); 1407 lru_add_drain();
1408 tlb_gather_mmu(&tlb, mm, 0); 1408 tlb_gather_mmu(&tlb, mm, 0);
1409 update_hiwater_rss(mm); 1409 update_hiwater_rss(mm);
1410 mmu_notifier_invalidate_range_start(mm, start, end); 1410 mmu_notifier_invalidate_range_start(mm, start, end);
1411 for ( ; vma && vma->vm_start < end; vma = vma->vm_next) 1411 for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
1412 unmap_single_vma(&tlb, vma, start, end, details); 1412 unmap_single_vma(&tlb, vma, start, end, details);
1413 mmu_notifier_invalidate_range_end(mm, start, end); 1413 mmu_notifier_invalidate_range_end(mm, start, end);
1414 tlb_finish_mmu(&tlb, start, end); 1414 tlb_finish_mmu(&tlb, start, end);
1415 } 1415 }
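
At this point in the tree, zap_page_range() is the back end of madvise(MADV_DONTNEED). A stripped-down, hypothetical sketch of such a caller (the name example_dontneed is invented here; the real madvise_dontneed() in mm/madvise.c additionally rejects locked, hugetlb and PFN-mapped vmas and handles nonlinear mappings):

#include <linux/mm.h>

/*
 * Sketch only: MADV_DONTNEED simply zaps the whole range and lets later
 * faults repopulate it.
 */
static long example_dontneed(struct vm_area_struct *vma,
                             unsigned long start, unsigned long end)
{
        zap_page_range(vma, start, end - start, NULL);
        return 0;
}

Passing NULL details means "zap everything", including swap entries, which is exactly what a caller discarding the range wants.
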
1416 1416
1417 /** 1417 /**
1418 * zap_page_range_single - remove user pages in a given range 1418 * zap_page_range_single - remove user pages in a given range
1419 * @vma: vm_area_struct holding the applicable pages 1419 * @vma: vm_area_struct holding the applicable pages
1420 * @address: starting address of pages to zap 1420 * @address: starting address of pages to zap
1421 * @size: number of bytes to zap 1421 * @size: number of bytes to zap
1422 * @details: details of nonlinear truncation or shared cache invalidation 1422 * @details: details of nonlinear truncation or shared cache invalidation
1423 * 1423 *
1424 * The range must fit into one VMA. 1424 * The range must fit into one VMA.
1425 */ 1425 */
1426 static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, 1426 static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1427 unsigned long size, struct zap_details *details) 1427 unsigned long size, struct zap_details *details)
1428 { 1428 {
1429 struct mm_struct *mm = vma->vm_mm; 1429 struct mm_struct *mm = vma->vm_mm;
1430 struct mmu_gather tlb; 1430 struct mmu_gather tlb;
1431 unsigned long end = address + size; 1431 unsigned long end = address + size;
1432 1432
1433 lru_add_drain(); 1433 lru_add_drain();
1434 tlb_gather_mmu(&tlb, mm, 0); 1434 tlb_gather_mmu(&tlb, mm, 0);
1435 update_hiwater_rss(mm); 1435 update_hiwater_rss(mm);
1436 mmu_notifier_invalidate_range_start(mm, address, end); 1436 mmu_notifier_invalidate_range_start(mm, address, end);
1437 unmap_single_vma(&tlb, vma, address, end, details); 1437 unmap_single_vma(&tlb, vma, address, end, details);
1438 mmu_notifier_invalidate_range_end(mm, address, end); 1438 mmu_notifier_invalidate_range_end(mm, address, end);
1439 tlb_finish_mmu(&tlb, address, end); 1439 tlb_finish_mmu(&tlb, address, end);
1440 } 1440 }
1441 1441
1442 /** 1442 /**
1443 * zap_vma_ptes - remove ptes mapping the vma 1443 * zap_vma_ptes - remove ptes mapping the vma
1444 * @vma: vm_area_struct holding ptes to be zapped 1444 * @vma: vm_area_struct holding ptes to be zapped
1445 * @address: starting address of pages to zap 1445 * @address: starting address of pages to zap
1446 * @size: number of bytes to zap 1446 * @size: number of bytes to zap
1447 * 1447 *
1448 * This function only unmaps ptes assigned to VM_PFNMAP vmas. 1448 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
1449 * 1449 *
1450 * The entire address range must be fully contained within the vma. 1450 * The entire address range must be fully contained within the vma.
1451 * 1451 *
1452 * Returns 0 if successful. 1452 * Returns 0 if successful.
1453 */ 1453 */
1454 int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, 1454 int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1455 unsigned long size) 1455 unsigned long size)
1456 { 1456 {
1457 if (address < vma->vm_start || address + size > vma->vm_end || 1457 if (address < vma->vm_start || address + size > vma->vm_end ||
1458 !(vma->vm_flags & VM_PFNMAP)) 1458 !(vma->vm_flags & VM_PFNMAP))
1459 return -1; 1459 return -1;
1460 zap_page_range_single(vma, address, size, NULL); 1460 zap_page_range_single(vma, address, size, NULL);
1461 return 0; 1461 return 0;
1462 } 1462 }
1463 EXPORT_SYMBOL_GPL(zap_vma_ptes); 1463 EXPORT_SYMBOL_GPL(zap_vma_ptes);
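
zap_vma_ptes() is the exported helper a driver can use to revoke PFN mappings it previously inserted into a VM_PFNMAP vma, for example when the backing device resource goes away. A minimal, hypothetical driver-side sketch (example_revoke_mapping is not from any real driver):

#include <linux/mm.h>

/* Hypothetical helper: tear down every pte in a vma the driver owns. */
static int example_revoke_mapping(struct vm_area_struct *vma)
{
        /* Only legal for VM_PFNMAP vmas; returns 0 on success, -1 otherwise. */
        return zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
}
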
1464 1464
1465 /** 1465 /**
1466 * follow_page_mask - look up a page descriptor from a user-virtual address 1466 * follow_page_mask - look up a page descriptor from a user-virtual address
1467 * @vma: vm_area_struct mapping @address 1467 * @vma: vm_area_struct mapping @address
1468 * @address: virtual address to look up 1468 * @address: virtual address to look up
1469 * @flags: flags modifying lookup behaviour 1469 * @flags: flags modifying lookup behaviour
1470 * @page_mask: on output, *page_mask is set according to the size of the page 1470 * @page_mask: on output, *page_mask is set according to the size of the page
1471 * 1471 *
1472 * @flags can have FOLL_ flags set, defined in <linux/mm.h> 1472 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
1473 * 1473 *
1474 * Returns the mapped (struct page *), %NULL if no mapping exists, or 1474 * Returns the mapped (struct page *), %NULL if no mapping exists, or
1475 * an error pointer if there is a mapping to something not represented 1475 * an error pointer if there is a mapping to something not represented
1476 * by a page descriptor (see also vm_normal_page()). 1476 * by a page descriptor (see also vm_normal_page()).
1477 */ 1477 */
1478 struct page *follow_page_mask(struct vm_area_struct *vma, 1478 struct page *follow_page_mask(struct vm_area_struct *vma,
1479 unsigned long address, unsigned int flags, 1479 unsigned long address, unsigned int flags,
1480 unsigned int *page_mask) 1480 unsigned int *page_mask)
1481 { 1481 {
1482 pgd_t *pgd; 1482 pgd_t *pgd;
1483 pud_t *pud; 1483 pud_t *pud;
1484 pmd_t *pmd; 1484 pmd_t *pmd;
1485 pte_t *ptep, pte; 1485 pte_t *ptep, pte;
1486 spinlock_t *ptl; 1486 spinlock_t *ptl;
1487 struct page *page; 1487 struct page *page;
1488 struct mm_struct *mm = vma->vm_mm; 1488 struct mm_struct *mm = vma->vm_mm;
1489 1489
1490 *page_mask = 0; 1490 *page_mask = 0;
1491 1491
1492 page = follow_huge_addr(mm, address, flags & FOLL_WRITE); 1492 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
1493 if (!IS_ERR(page)) { 1493 if (!IS_ERR(page)) {
1494 BUG_ON(flags & FOLL_GET); 1494 BUG_ON(flags & FOLL_GET);
1495 goto out; 1495 goto out;
1496 } 1496 }
1497 1497
1498 page = NULL; 1498 page = NULL;
1499 pgd = pgd_offset(mm, address); 1499 pgd = pgd_offset(mm, address);
1500 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) 1500 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
1501 goto no_page_table; 1501 goto no_page_table;
1502 1502
1503 pud = pud_offset(pgd, address); 1503 pud = pud_offset(pgd, address);
1504 if (pud_none(*pud)) 1504 if (pud_none(*pud))
1505 goto no_page_table; 1505 goto no_page_table;
1506 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { 1506 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
1507 BUG_ON(flags & FOLL_GET); 1507 BUG_ON(flags & FOLL_GET);
1508 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); 1508 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1509 goto out; 1509 goto out;
1510 } 1510 }
1511 if (unlikely(pud_bad(*pud))) 1511 if (unlikely(pud_bad(*pud)))
1512 goto no_page_table; 1512 goto no_page_table;
1513 1513
1514 pmd = pmd_offset(pud, address); 1514 pmd = pmd_offset(pud, address);
1515 if (pmd_none(*pmd)) 1515 if (pmd_none(*pmd))
1516 goto no_page_table; 1516 goto no_page_table;
1517 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { 1517 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
1518 BUG_ON(flags & FOLL_GET); 1518 BUG_ON(flags & FOLL_GET);
1519 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1519 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1520 goto out; 1520 goto out;
1521 } 1521 }
1522 if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) 1522 if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
1523 goto no_page_table; 1523 goto no_page_table;
1524 if (pmd_trans_huge(*pmd)) { 1524 if (pmd_trans_huge(*pmd)) {
1525 if (flags & FOLL_SPLIT) { 1525 if (flags & FOLL_SPLIT) {
1526 split_huge_page_pmd(vma, address, pmd); 1526 split_huge_page_pmd(vma, address, pmd);
1527 goto split_fallthrough; 1527 goto split_fallthrough;
1528 } 1528 }
1529 spin_lock(&mm->page_table_lock); 1529 spin_lock(&mm->page_table_lock);
1530 if (likely(pmd_trans_huge(*pmd))) { 1530 if (likely(pmd_trans_huge(*pmd))) {
1531 if (unlikely(pmd_trans_splitting(*pmd))) { 1531 if (unlikely(pmd_trans_splitting(*pmd))) {
1532 spin_unlock(&mm->page_table_lock); 1532 spin_unlock(&mm->page_table_lock);
1533 wait_split_huge_page(vma->anon_vma, pmd); 1533 wait_split_huge_page(vma->anon_vma, pmd);
1534 } else { 1534 } else {
1535 page = follow_trans_huge_pmd(vma, address, 1535 page = follow_trans_huge_pmd(vma, address,
1536 pmd, flags); 1536 pmd, flags);
1537 spin_unlock(&mm->page_table_lock); 1537 spin_unlock(&mm->page_table_lock);
1538 *page_mask = HPAGE_PMD_NR - 1; 1538 *page_mask = HPAGE_PMD_NR - 1;
1539 goto out; 1539 goto out;
1540 } 1540 }
1541 } else 1541 } else
1542 spin_unlock(&mm->page_table_lock); 1542 spin_unlock(&mm->page_table_lock);
1543 /* fall through */ 1543 /* fall through */
1544 } 1544 }
1545 split_fallthrough: 1545 split_fallthrough:
1546 if (unlikely(pmd_bad(*pmd))) 1546 if (unlikely(pmd_bad(*pmd)))
1547 goto no_page_table; 1547 goto no_page_table;
1548 1548
1549 ptep = pte_offset_map_lock(mm, pmd, address, &ptl); 1549 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
1550 1550
1551 pte = *ptep; 1551 pte = *ptep;
1552 if (!pte_present(pte)) { 1552 if (!pte_present(pte)) {
1553 swp_entry_t entry; 1553 swp_entry_t entry;
1554 /* 1554 /*
1555 * KSM's break_ksm() relies upon recognizing a ksm page 1555 * KSM's break_ksm() relies upon recognizing a ksm page
1556 * even while it is being migrated, so for that case we 1556 * even while it is being migrated, so for that case we
1557 * need migration_entry_wait(). 1557 * need migration_entry_wait().
1558 */ 1558 */
1559 if (likely(!(flags & FOLL_MIGRATION))) 1559 if (likely(!(flags & FOLL_MIGRATION)))
1560 goto no_page; 1560 goto no_page;
1561 if (pte_none(pte) || pte_file(pte)) 1561 if (pte_none(pte) || pte_file(pte))
1562 goto no_page; 1562 goto no_page;
1563 entry = pte_to_swp_entry(pte); 1563 entry = pte_to_swp_entry(pte);
1564 if (!is_migration_entry(entry)) 1564 if (!is_migration_entry(entry))
1565 goto no_page; 1565 goto no_page;
1566 pte_unmap_unlock(ptep, ptl); 1566 pte_unmap_unlock(ptep, ptl);
1567 migration_entry_wait(mm, pmd, address); 1567 migration_entry_wait(mm, pmd, address);
1568 goto split_fallthrough; 1568 goto split_fallthrough;
1569 } 1569 }
1570 if ((flags & FOLL_NUMA) && pte_numa(pte)) 1570 if ((flags & FOLL_NUMA) && pte_numa(pte))
1571 goto no_page; 1571 goto no_page;
1572 if ((flags & FOLL_WRITE) && !pte_write(pte)) 1572 if ((flags & FOLL_WRITE) && !pte_write(pte))
1573 goto unlock; 1573 goto unlock;
1574 1574
1575 page = vm_normal_page(vma, address, pte); 1575 page = vm_normal_page(vma, address, pte);
1576 if (unlikely(!page)) { 1576 if (unlikely(!page)) {
1577 if ((flags & FOLL_DUMP) || 1577 if ((flags & FOLL_DUMP) ||
1578 !is_zero_pfn(pte_pfn(pte))) 1578 !is_zero_pfn(pte_pfn(pte)))
1579 goto bad_page; 1579 goto bad_page;
1580 page = pte_page(pte); 1580 page = pte_page(pte);
1581 } 1581 }
1582 1582
1583 if (flags & FOLL_GET) 1583 if (flags & FOLL_GET)
1584 get_page_foll(page); 1584 get_page_foll(page);
1585 if (flags & FOLL_TOUCH) { 1585 if (flags & FOLL_TOUCH) {
1586 if ((flags & FOLL_WRITE) && 1586 if ((flags & FOLL_WRITE) &&
1587 !pte_dirty(pte) && !PageDirty(page)) 1587 !pte_dirty(pte) && !PageDirty(page))
1588 set_page_dirty(page); 1588 set_page_dirty(page);
1589 /* 1589 /*
1590 * pte_mkyoung() would be more correct here, but atomic care 1590 * pte_mkyoung() would be more correct here, but atomic care
1591 * is needed to avoid losing the dirty bit: it is easier to use 1591 * is needed to avoid losing the dirty bit: it is easier to use
1592 * mark_page_accessed(). 1592 * mark_page_accessed().
1593 */ 1593 */
1594 mark_page_accessed(page); 1594 mark_page_accessed(page);
1595 } 1595 }
1596 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { 1596 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1597 /* 1597 /*
1598 * The preliminary mapping check is mainly to avoid the 1598 * The preliminary mapping check is mainly to avoid the
1599 * pointless overhead of lock_page on the ZERO_PAGE 1599 * pointless overhead of lock_page on the ZERO_PAGE
1600 * which might bounce very badly if there is contention. 1600 * which might bounce very badly if there is contention.
1601 * 1601 *
1602 * If the page is already locked, we don't need to 1602 * If the page is already locked, we don't need to
1603 * handle it now - vmscan will handle it later if and 1603 * handle it now - vmscan will handle it later if and
1604 * when it attempts to reclaim the page. 1604 * when it attempts to reclaim the page.
1605 */ 1605 */
1606 if (page->mapping && trylock_page(page)) { 1606 if (page->mapping && trylock_page(page)) {
1607 lru_add_drain(); /* push cached pages to LRU */ 1607 lru_add_drain(); /* push cached pages to LRU */
1608 /* 1608 /*
1609 * Because we lock page here, and migration is 1609 * Because we lock page here, and migration is
1610 * blocked by the pte's page reference, and we 1610 * blocked by the pte's page reference, and we
1611 * know the page is still mapped, we don't even 1611 * know the page is still mapped, we don't even
1612 * need to check for file-cache page truncation. 1612 * need to check for file-cache page truncation.
1613 */ 1613 */
1614 mlock_vma_page(page); 1614 mlock_vma_page(page);
1615 unlock_page(page); 1615 unlock_page(page);
1616 } 1616 }
1617 } 1617 }
1618 unlock: 1618 unlock:
1619 pte_unmap_unlock(ptep, ptl); 1619 pte_unmap_unlock(ptep, ptl);
1620 out: 1620 out:
1621 return page; 1621 return page;
1622 1622
1623 bad_page: 1623 bad_page:
1624 pte_unmap_unlock(ptep, ptl); 1624 pte_unmap_unlock(ptep, ptl);
1625 return ERR_PTR(-EFAULT); 1625 return ERR_PTR(-EFAULT);
1626 1626
1627 no_page: 1627 no_page:
1628 pte_unmap_unlock(ptep, ptl); 1628 pte_unmap_unlock(ptep, ptl);
1629 if (!pte_none(pte)) 1629 if (!pte_none(pte))
1630 return page; 1630 return page;
1631 1631
1632 no_page_table: 1632 no_page_table:
1633 /* 1633 /*
1634 * When core dumping an enormous anonymous area that nobody 1634 * When core dumping an enormous anonymous area that nobody
1635 * has touched so far, we don't want to allocate unnecessary pages or 1635 * has touched so far, we don't want to allocate unnecessary pages or
1636 * page tables. Return error instead of NULL to skip handle_mm_fault, 1636 * page tables. Return error instead of NULL to skip handle_mm_fault,
1637 * then get_dump_page() will return NULL to leave a hole in the dump. 1637 * then get_dump_page() will return NULL to leave a hole in the dump.
1638 * But we can only make this optimization where a hole would surely 1638 * But we can only make this optimization where a hole would surely
1639 * be zero-filled if handle_mm_fault() actually did handle it. 1639 * be zero-filled if handle_mm_fault() actually did handle it.
1640 */ 1640 */
1641 if ((flags & FOLL_DUMP) && 1641 if ((flags & FOLL_DUMP) &&
1642 (!vma->vm_ops || !vma->vm_ops->fault)) 1642 (!vma->vm_ops || !vma->vm_ops->fault))
1643 return ERR_PTR(-EFAULT); 1643 return ERR_PTR(-EFAULT);
1644 return page; 1644 return page;
1645 } 1645 }
1646 1646
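
follow_page_mask() is an internal lookup used by __get_user_pages() below; the *page_mask output lets the caller step over all the subpages of a transparent huge page in one go. A hedged sketch of a direct lookup with this interface (example_lookup is invented here; the caller is assumed to hold mmap_sem for read, and hugetlb vmas are skipped because they are served by follow_hugetlb_page() instead):

#include <linux/err.h>
#include <linux/hugetlb.h>
#include <linux/mm.h>

/*
 * Illustrative only: translate one user address in 'mm' into a pinned
 * struct page. Caller must hold mm->mmap_sem for read.
 */
static struct page *example_lookup(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *vma;
        struct page *page;
        unsigned int page_mask;

        vma = find_vma(mm, addr);
        if (!vma || addr < vma->vm_start || is_vm_hugetlb_page(vma))
                return NULL;

        page = follow_page_mask(vma, addr, FOLL_GET, &page_mask);
        if (IS_ERR(page))
                return NULL;
        /* caller must put_page(page) when done, if non-NULL */
        return page;
}
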
1647 static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) 1647 static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
1648 { 1648 {
1649 return stack_guard_page_start(vma, addr) || 1649 return stack_guard_page_start(vma, addr) ||
1650 stack_guard_page_end(vma, addr+PAGE_SIZE); 1650 stack_guard_page_end(vma, addr+PAGE_SIZE);
1651 } 1651 }
1652 1652
1653 /** 1653 /**
1654 * __get_user_pages() - pin user pages in memory 1654 * __get_user_pages() - pin user pages in memory
1655 * @tsk: task_struct of target task 1655 * @tsk: task_struct of target task
1656 * @mm: mm_struct of target mm 1656 * @mm: mm_struct of target mm
1657 * @start: starting user address 1657 * @start: starting user address
1658 * @nr_pages: number of pages from start to pin 1658 * @nr_pages: number of pages from start to pin
1659 * @gup_flags: flags modifying pin behaviour 1659 * @gup_flags: flags modifying pin behaviour
1660 * @pages: array that receives pointers to the pages pinned. 1660 * @pages: array that receives pointers to the pages pinned.
1661 * Should be at least nr_pages long. Or NULL, if caller 1661 * Should be at least nr_pages long. Or NULL, if caller
1662 * only intends to ensure the pages are faulted in. 1662 * only intends to ensure the pages are faulted in.
1663 * @vmas: array of pointers to vmas corresponding to each page. 1663 * @vmas: array of pointers to vmas corresponding to each page.
1664 * Or NULL if the caller does not require them. 1664 * Or NULL if the caller does not require them.
1665 * @nonblocking: whether waiting for disk IO or mmap_sem contention 1665 * @nonblocking: whether waiting for disk IO or mmap_sem contention
1666 * 1666 *
1667 * Returns number of pages pinned. This may be fewer than the number 1667 * Returns number of pages pinned. This may be fewer than the number
1668 * requested. If nr_pages is 0 or negative, returns 0. If no pages 1668 * requested. If nr_pages is 0 or negative, returns 0. If no pages
1669 * were pinned, returns -errno. Each page returned must be released 1669 * were pinned, returns -errno. Each page returned must be released
1670 * with a put_page() call when it is finished with. vmas will only 1670 * with a put_page() call when it is finished with. vmas will only
1671 * remain valid while mmap_sem is held. 1671 * remain valid while mmap_sem is held.
1672 * 1672 *
1673 * Must be called with mmap_sem held for read or write. 1673 * Must be called with mmap_sem held for read or write.
1674 * 1674 *
1675 * __get_user_pages walks a process's page tables and takes a reference to 1675 * __get_user_pages walks a process's page tables and takes a reference to
1676 * each struct page that each user address corresponds to at a given 1676 * each struct page that each user address corresponds to at a given
1677 * instant. That is, it takes the page that would be accessed if a user 1677 * instant. That is, it takes the page that would be accessed if a user
1678 * thread accesses the given user virtual address at that instant. 1678 * thread accesses the given user virtual address at that instant.
1679 * 1679 *
1680 * This does not guarantee that the page exists in the user mappings when 1680 * This does not guarantee that the page exists in the user mappings when
1681 * __get_user_pages returns, and there may even be a completely different 1681 * __get_user_pages returns, and there may even be a completely different
1682 * page there in some cases (eg. if mmapped pagecache has been invalidated 1682 * page there in some cases (eg. if mmapped pagecache has been invalidated
1683 * and subsequently re-faulted). However it does guarantee that the page 1683 * and subsequently re-faulted). However it does guarantee that the page

1684 * won't be freed completely. And mostly callers simply care that the page 1684 * won't be freed completely. And mostly callers simply care that the page
1685 * contains data that was valid *at some point in time*. Typically, an IO 1685 * contains data that was valid *at some point in time*. Typically, an IO
1686 * or similar operation cannot guarantee anything stronger anyway because 1686 * or similar operation cannot guarantee anything stronger anyway because
1687 * locks can't be held over the syscall boundary. 1687 * locks can't be held over the syscall boundary.
1688 * 1688 *
1689 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If 1689 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
1690 * the page is written to, set_page_dirty (or set_page_dirty_lock, as 1690 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
1691 * appropriate) must be called after the page is finished with, and 1691 * appropriate) must be called after the page is finished with, and
1692 * before put_page is called. 1692 * before put_page is called.
1693 * 1693 *
1694 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO 1694 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
1695 * or mmap_sem contention, and if waiting is needed to pin all pages, 1695 * or mmap_sem contention, and if waiting is needed to pin all pages,
1696 * *@nonblocking will be set to 0. 1696 * *@nonblocking will be set to 0.
1697 * 1697 *
1698 * In most cases, get_user_pages or get_user_pages_fast should be used 1698 * In most cases, get_user_pages or get_user_pages_fast should be used
1699 * instead of __get_user_pages. __get_user_pages should be used only if 1699 * instead of __get_user_pages. __get_user_pages should be used only if
1700 * you need some special @gup_flags. 1700 * you need some special @gup_flags.
1701 */ 1701 */
1702 long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1702 long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1703 unsigned long start, unsigned long nr_pages, 1703 unsigned long start, unsigned long nr_pages,
1704 unsigned int gup_flags, struct page **pages, 1704 unsigned int gup_flags, struct page **pages,
1705 struct vm_area_struct **vmas, int *nonblocking) 1705 struct vm_area_struct **vmas, int *nonblocking)
1706 { 1706 {
1707 long i; 1707 long i;
1708 unsigned long vm_flags; 1708 unsigned long vm_flags;
1709 unsigned int page_mask; 1709 unsigned int page_mask;
1710 1710
1711 if (!nr_pages) 1711 if (!nr_pages)
1712 return 0; 1712 return 0;
1713 1713
1714 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); 1714 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
1715 1715
1716 /* 1716 /*
1717 * Require read or write permissions. 1717 * Require read or write permissions.
1718 * If FOLL_FORCE is set, we only require the "MAY" flags. 1718 * If FOLL_FORCE is set, we only require the "MAY" flags.
1719 */ 1719 */
1720 vm_flags = (gup_flags & FOLL_WRITE) ? 1720 vm_flags = (gup_flags & FOLL_WRITE) ?
1721 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 1721 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1722 vm_flags &= (gup_flags & FOLL_FORCE) ? 1722 vm_flags &= (gup_flags & FOLL_FORCE) ?
1723 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 1723 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1724 1724
1725 /* 1725 /*
1726 * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault 1726 * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
1727 * would be called on PROT_NONE ranges. We must never invoke 1727 * would be called on PROT_NONE ranges. We must never invoke
1728 * handle_mm_fault on PROT_NONE ranges or the NUMA hinting 1728 * handle_mm_fault on PROT_NONE ranges or the NUMA hinting
1729 * page faults would unprotect the PROT_NONE ranges if 1729 * page faults would unprotect the PROT_NONE ranges if
1730 * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd 1730 * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
1731 * bitflag. So to avoid that, don't set FOLL_NUMA if 1731 * bitflag. So to avoid that, don't set FOLL_NUMA if
1732 * FOLL_FORCE is set. 1732 * FOLL_FORCE is set.
1733 */ 1733 */
1734 if (!(gup_flags & FOLL_FORCE)) 1734 if (!(gup_flags & FOLL_FORCE))
1735 gup_flags |= FOLL_NUMA; 1735 gup_flags |= FOLL_NUMA;
1736 1736
1737 i = 0; 1737 i = 0;
1738 1738
1739 do { 1739 do {
1740 struct vm_area_struct *vma; 1740 struct vm_area_struct *vma;
1741 1741
1742 vma = find_extend_vma(mm, start); 1742 vma = find_extend_vma(mm, start);
1743 if (!vma && in_gate_area(mm, start)) { 1743 if (!vma && in_gate_area(mm, start)) {
1744 unsigned long pg = start & PAGE_MASK; 1744 unsigned long pg = start & PAGE_MASK;
1745 pgd_t *pgd; 1745 pgd_t *pgd;
1746 pud_t *pud; 1746 pud_t *pud;
1747 pmd_t *pmd; 1747 pmd_t *pmd;
1748 pte_t *pte; 1748 pte_t *pte;
1749 1749
1750 /* user gate pages are read-only */ 1750 /* user gate pages are read-only */
1751 if (gup_flags & FOLL_WRITE) 1751 if (gup_flags & FOLL_WRITE)
1752 return i ? : -EFAULT; 1752 return i ? : -EFAULT;
1753 if (pg > TASK_SIZE) 1753 if (pg > TASK_SIZE)
1754 pgd = pgd_offset_k(pg); 1754 pgd = pgd_offset_k(pg);
1755 else 1755 else
1756 pgd = pgd_offset_gate(mm, pg); 1756 pgd = pgd_offset_gate(mm, pg);
1757 BUG_ON(pgd_none(*pgd)); 1757 BUG_ON(pgd_none(*pgd));
1758 pud = pud_offset(pgd, pg); 1758 pud = pud_offset(pgd, pg);
1759 BUG_ON(pud_none(*pud)); 1759 BUG_ON(pud_none(*pud));
1760 pmd = pmd_offset(pud, pg); 1760 pmd = pmd_offset(pud, pg);
1761 if (pmd_none(*pmd)) 1761 if (pmd_none(*pmd))
1762 return i ? : -EFAULT; 1762 return i ? : -EFAULT;
1763 VM_BUG_ON(pmd_trans_huge(*pmd)); 1763 VM_BUG_ON(pmd_trans_huge(*pmd));
1764 pte = pte_offset_map(pmd, pg); 1764 pte = pte_offset_map(pmd, pg);
1765 if (pte_none(*pte)) { 1765 if (pte_none(*pte)) {
1766 pte_unmap(pte); 1766 pte_unmap(pte);
1767 return i ? : -EFAULT; 1767 return i ? : -EFAULT;
1768 } 1768 }
1769 vma = get_gate_vma(mm); 1769 vma = get_gate_vma(mm);
1770 if (pages) { 1770 if (pages) {
1771 struct page *page; 1771 struct page *page;
1772 1772
1773 page = vm_normal_page(vma, start, *pte); 1773 page = vm_normal_page(vma, start, *pte);
1774 if (!page) { 1774 if (!page) {
1775 if (!(gup_flags & FOLL_DUMP) && 1775 if (!(gup_flags & FOLL_DUMP) &&
1776 is_zero_pfn(pte_pfn(*pte))) 1776 is_zero_pfn(pte_pfn(*pte)))
1777 page = pte_page(*pte); 1777 page = pte_page(*pte);
1778 else { 1778 else {
1779 pte_unmap(pte); 1779 pte_unmap(pte);
1780 return i ? : -EFAULT; 1780 return i ? : -EFAULT;
1781 } 1781 }
1782 } 1782 }
1783 pages[i] = page; 1783 pages[i] = page;
1784 get_page(page); 1784 get_page(page);
1785 } 1785 }
1786 pte_unmap(pte); 1786 pte_unmap(pte);
1787 page_mask = 0; 1787 page_mask = 0;
1788 goto next_page; 1788 goto next_page;
1789 } 1789 }
1790 1790
1791 if (!vma || 1791 if (!vma ||
1792 (vma->vm_flags & (VM_IO | VM_PFNMAP)) || 1792 (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1793 !(vm_flags & vma->vm_flags)) 1793 !(vm_flags & vma->vm_flags))
1794 return i ? : -EFAULT; 1794 return i ? : -EFAULT;
1795 1795
1796 if (is_vm_hugetlb_page(vma)) { 1796 if (is_vm_hugetlb_page(vma)) {
1797 i = follow_hugetlb_page(mm, vma, pages, vmas, 1797 i = follow_hugetlb_page(mm, vma, pages, vmas,
1798 &start, &nr_pages, i, gup_flags); 1798 &start, &nr_pages, i, gup_flags);
1799 continue; 1799 continue;
1800 } 1800 }
1801 1801
1802 do { 1802 do {
1803 struct page *page; 1803 struct page *page;
1804 unsigned int foll_flags = gup_flags; 1804 unsigned int foll_flags = gup_flags;
1805 unsigned int page_increm; 1805 unsigned int page_increm;
1806 1806
1807 /* 1807 /*
1808 * If we have a pending SIGKILL, don't keep faulting 1808 * If we have a pending SIGKILL, don't keep faulting
1809 * pages and potentially allocating memory. 1809 * pages and potentially allocating memory.
1810 */ 1810 */
1811 if (unlikely(fatal_signal_pending(current))) 1811 if (unlikely(fatal_signal_pending(current)))
1812 return i ? i : -ERESTARTSYS; 1812 return i ? i : -ERESTARTSYS;
1813 1813
1814 cond_resched(); 1814 cond_resched();
1815 while (!(page = follow_page_mask(vma, start, 1815 while (!(page = follow_page_mask(vma, start,
1816 foll_flags, &page_mask))) { 1816 foll_flags, &page_mask))) {
1817 int ret; 1817 int ret;
1818 unsigned int fault_flags = 0; 1818 unsigned int fault_flags = 0;
1819 1819
1820 /* For mlock, just skip the stack guard page. */ 1820 /* For mlock, just skip the stack guard page. */
1821 if (foll_flags & FOLL_MLOCK) { 1821 if (foll_flags & FOLL_MLOCK) {
1822 if (stack_guard_page(vma, start)) 1822 if (stack_guard_page(vma, start))
1823 goto next_page; 1823 goto next_page;
1824 } 1824 }
1825 if (foll_flags & FOLL_WRITE) 1825 if (foll_flags & FOLL_WRITE)
1826 fault_flags |= FAULT_FLAG_WRITE; 1826 fault_flags |= FAULT_FLAG_WRITE;
1827 if (nonblocking) 1827 if (nonblocking)
1828 fault_flags |= FAULT_FLAG_ALLOW_RETRY; 1828 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
1829 if (foll_flags & FOLL_NOWAIT) 1829 if (foll_flags & FOLL_NOWAIT)
1830 fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT); 1830 fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT);
1831 1831
1832 ret = handle_mm_fault(mm, vma, start, 1832 ret = handle_mm_fault(mm, vma, start,
1833 fault_flags); 1833 fault_flags);
1834 1834
1835 if (ret & VM_FAULT_ERROR) { 1835 if (ret & VM_FAULT_ERROR) {
1836 if (ret & VM_FAULT_OOM) 1836 if (ret & VM_FAULT_OOM)
1837 return i ? i : -ENOMEM; 1837 return i ? i : -ENOMEM;
1838 if (ret & (VM_FAULT_HWPOISON | 1838 if (ret & (VM_FAULT_HWPOISON |
1839 VM_FAULT_HWPOISON_LARGE)) { 1839 VM_FAULT_HWPOISON_LARGE)) {
1840 if (i) 1840 if (i)
1841 return i; 1841 return i;
1842 else if (gup_flags & FOLL_HWPOISON) 1842 else if (gup_flags & FOLL_HWPOISON)
1843 return -EHWPOISON; 1843 return -EHWPOISON;
1844 else 1844 else
1845 return -EFAULT; 1845 return -EFAULT;
1846 } 1846 }
1847 if (ret & VM_FAULT_SIGBUS) 1847 if (ret & VM_FAULT_SIGBUS)
1848 return i ? i : -EFAULT; 1848 return i ? i : -EFAULT;
1849 BUG(); 1849 BUG();
1850 } 1850 }
1851 1851
1852 if (tsk) { 1852 if (tsk) {
1853 if (ret & VM_FAULT_MAJOR) 1853 if (ret & VM_FAULT_MAJOR)
1854 tsk->maj_flt++; 1854 tsk->maj_flt++;
1855 else 1855 else
1856 tsk->min_flt++; 1856 tsk->min_flt++;
1857 } 1857 }
1858 1858
1859 if (ret & VM_FAULT_RETRY) { 1859 if (ret & VM_FAULT_RETRY) {
1860 if (nonblocking) 1860 if (nonblocking)
1861 *nonblocking = 0; 1861 *nonblocking = 0;
1862 return i; 1862 return i;
1863 } 1863 }
1864 1864
1865 /* 1865 /*
1866 * The VM_FAULT_WRITE bit tells us that 1866 * The VM_FAULT_WRITE bit tells us that
1867 * do_wp_page has broken COW when necessary, 1867 * do_wp_page has broken COW when necessary,
1868 * even if maybe_mkwrite decided not to set 1868 * even if maybe_mkwrite decided not to set
1869 * pte_write. We can thus safely do subsequent 1869 * pte_write. We can thus safely do subsequent
1870 * page lookups as if they were reads. But only 1870 * page lookups as if they were reads. But only
1871 * do so when looping for pte_write is futile: 1871 * do so when looping for pte_write is futile:
1872 * in some cases userspace may also be wanting 1872 * in some cases userspace may also be wanting
1873 * to write to the gotten user page, which a 1873 * to write to the gotten user page, which a
1874 * read fault here might prevent (a readonly 1874 * read fault here might prevent (a readonly
1875 * page might get reCOWed by userspace write). 1875 * page might get reCOWed by userspace write).
1876 */ 1876 */
1877 if ((ret & VM_FAULT_WRITE) && 1877 if ((ret & VM_FAULT_WRITE) &&
1878 !(vma->vm_flags & VM_WRITE)) 1878 !(vma->vm_flags & VM_WRITE))
1879 foll_flags &= ~FOLL_WRITE; 1879 foll_flags &= ~FOLL_WRITE;
1880 1880
1881 cond_resched(); 1881 cond_resched();
1882 } 1882 }
1883 if (IS_ERR(page)) 1883 if (IS_ERR(page))
1884 return i ? i : PTR_ERR(page); 1884 return i ? i : PTR_ERR(page);
1885 if (pages) { 1885 if (pages) {
1886 pages[i] = page; 1886 pages[i] = page;
1887 1887
1888 flush_anon_page(vma, page, start); 1888 flush_anon_page(vma, page, start);
1889 flush_dcache_page(page); 1889 flush_dcache_page(page);
1890 page_mask = 0; 1890 page_mask = 0;
1891 } 1891 }
1892 next_page: 1892 next_page:
1893 if (vmas) { 1893 if (vmas) {
1894 vmas[i] = vma; 1894 vmas[i] = vma;
1895 page_mask = 0; 1895 page_mask = 0;
1896 } 1896 }
1897 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); 1897 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
1898 if (page_increm > nr_pages) 1898 if (page_increm > nr_pages)
1899 page_increm = nr_pages; 1899 page_increm = nr_pages;
1900 i += page_increm; 1900 i += page_increm;
1901 start += page_increm * PAGE_SIZE; 1901 start += page_increm * PAGE_SIZE;
1902 nr_pages -= page_increm; 1902 nr_pages -= page_increm;
1903 } while (nr_pages && start < vma->vm_end); 1903 } while (nr_pages && start < vma->vm_end);
1904 } while (nr_pages); 1904 } while (nr_pages);
1905 return i; 1905 return i;
1906 } 1906 }
1907 EXPORT_SYMBOL(__get_user_pages); 1907 EXPORT_SYMBOL(__get_user_pages);
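
The rules spelled out in the comment above (hold mmap_sem, release every page with put_page(), and mark pages dirty before releasing them if they were written) translate into the classic pinning pattern below. This is an illustrative driver-style sketch using get_user_pages_fast(); the helper name and flow are assumptions, not code from this file:

#include <linux/mm.h>

/*
 * Illustrative only: pin 'nr' user pages that will be written on the user's
 * behalf, then dirty and release them. Error handling is minimal.
 */
static int example_pin_and_release(unsigned long uaddr, int nr,
                                   struct page **pages)
{
        int i, pinned;

        /* write=1: we intend to modify the pages */
        pinned = get_user_pages_fast(uaddr, nr, 1, pages);
        if (pinned <= 0)
                return pinned ? pinned : -EFAULT;

        /* ... DMA or memcpy into the pinned pages would happen here ... */

        for (i = 0; i < pinned; i++) {
                set_page_dirty_lock(pages[i]);  /* dirty before put_page() */
                put_page(pages[i]);
        }
        return pinned;
}
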
1908 1908
1909 /* 1909 /*
1910 * fixup_user_fault() - manually resolve a user page fault 1910 * fixup_user_fault() - manually resolve a user page fault
1911 * @tsk: the task_struct to use for page fault accounting, or 1911 * @tsk: the task_struct to use for page fault accounting, or
1912 * NULL if faults are not to be recorded. 1912 * NULL if faults are not to be recorded.
1913 * @mm: mm_struct of target mm 1913 * @mm: mm_struct of target mm
1914 * @address: user address 1914 * @address: user address
1915 * @fault_flags:flags to pass down to handle_mm_fault() 1915 * @fault_flags:flags to pass down to handle_mm_fault()
1916 * 1916 *
1917 * This is meant to be called in the specific scenario where, for locking 1917 * This is meant to be called in the specific scenario where, for locking
1918 * reasons, we try to access user memory in atomic context (within a 1918 * reasons, we try to access user memory in atomic context (within a
1919 * pagefault_disable() section), the access fails with -EFAULT, and we want 1919 * pagefault_disable() section), the access fails with -EFAULT, and we want
1920 * to resolve the user fault before trying again. 1920 * to resolve the user fault before trying again.
1921 * 1921 *
1922 * Typically this is meant to be used by the futex code. 1922 * Typically this is meant to be used by the futex code.
1923 * 1923 *
1924 * The main difference with get_user_pages() is that this function will 1924 * The main difference with get_user_pages() is that this function will
1925 * unconditionally call handle_mm_fault() which will in turn perform all the 1925 * unconditionally call handle_mm_fault() which will in turn perform all the
1926 * necessary SW fixup of the dirty and young bits in the PTE, while 1926 * necessary SW fixup of the dirty and young bits in the PTE, while
1927 * get_user_pages() only guarantees to update these in the struct page. 1927 * get_user_pages() only guarantees to update these in the struct page.
1928 * 1928 *
1929 * This is important for some architectures where those bits also gate the 1929 * This is important for some architectures where those bits also gate the
1930 * access permission to the page because they are maintained in software. On 1930 * access permission to the page because they are maintained in software. On
1931 * such architectures, gup() will not be enough to make a subsequent access 1931 * such architectures, gup() will not be enough to make a subsequent access
1932 * succeed. 1932 * succeed.
1933 * 1933 *
1934 * This should be called with the mmap_sem held for read. 1934 * This should be called with the mmap_sem held for read.
1935 */ 1935 */
1936 int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, 1936 int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1937 unsigned long address, unsigned int fault_flags) 1937 unsigned long address, unsigned int fault_flags)
1938 { 1938 {
1939 struct vm_area_struct *vma; 1939 struct vm_area_struct *vma;
1940 int ret; 1940 int ret;
1941 1941
1942 vma = find_extend_vma(mm, address); 1942 vma = find_extend_vma(mm, address);
1943 if (!vma || address < vma->vm_start) 1943 if (!vma || address < vma->vm_start)
1944 return -EFAULT; 1944 return -EFAULT;
1945 1945
1946 ret = handle_mm_fault(mm, vma, address, fault_flags); 1946 ret = handle_mm_fault(mm, vma, address, fault_flags);
1947 if (ret & VM_FAULT_ERROR) { 1947 if (ret & VM_FAULT_ERROR) {
1948 if (ret & VM_FAULT_OOM) 1948 if (ret & VM_FAULT_OOM)
1949 return -ENOMEM; 1949 return -ENOMEM;
1950 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) 1950 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
1951 return -EHWPOISON; 1951 return -EHWPOISON;
1952 if (ret & VM_FAULT_SIGBUS) 1952 if (ret & VM_FAULT_SIGBUS)
1953 return -EFAULT; 1953 return -EFAULT;
1954 BUG(); 1954 BUG();
1955 } 1955 }
1956 if (tsk) { 1956 if (tsk) {
1957 if (ret & VM_FAULT_MAJOR) 1957 if (ret & VM_FAULT_MAJOR)
1958 tsk->maj_flt++; 1958 tsk->maj_flt++;
1959 else 1959 else
1960 tsk->min_flt++; 1960 tsk->min_flt++;
1961 } 1961 }
1962 return 0; 1962 return 0;
1963 } 1963 }
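
The typical consumer is the futex code: an atomic user access inside a pagefault_disable() section fails with -EFAULT, and the slow path then resolves the fault explicitly before retrying. A sketch in that spirit (modelled on, but not copied from, kernel/futex.c; the helper name is invented):

#include <linux/mm.h>
#include <linux/sched.h>

/*
 * Illustrative only: make one user word writable again after an atomic
 * access failed, so the caller can retry its pagefault_disable()d access.
 */
static int example_fault_in_writeable(u32 __user *uaddr)
{
        struct mm_struct *mm = current->mm;
        int ret;

        down_read(&mm->mmap_sem);
        ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
                               FAULT_FLAG_WRITE);
        up_read(&mm->mmap_sem);

        return ret < 0 ? ret : 0;
}
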
1964 1964
1965 /* 1965 /*
1966 * get_user_pages() - pin user pages in memory 1966 * get_user_pages() - pin user pages in memory
1967 * @tsk: the task_struct to use for page fault accounting, or 1967 * @tsk: the task_struct to use for page fault accounting, or
1968 * NULL if faults are not to be recorded. 1968 * NULL if faults are not to be recorded.
1969 * @mm: mm_struct of target mm 1969 * @mm: mm_struct of target mm
1970 * @start: starting user address 1970 * @start: starting user address
1971 * @nr_pages: number of pages from start to pin 1971 * @nr_pages: number of pages from start to pin
1972 * @write: whether pages will be written to by the caller 1972 * @write: whether pages will be written to by the caller
1973 * @force: whether to force write access even if user mapping is 1973 * @force: whether to force write access even if user mapping is
1974 * readonly. This will result in the page being COWed even 1974 * readonly. This will result in the page being COWed even
1975 * in MAP_SHARED mappings. You do not want this. 1975 * in MAP_SHARED mappings. You do not want this.
1976 * @pages: array that receives pointers to the pages pinned. 1976 * @pages: array that receives pointers to the pages pinned.
1977 * Should be at least nr_pages long. Or NULL, if caller 1977 * Should be at least nr_pages long. Or NULL, if caller
1978 * only intends to ensure the pages are faulted in. 1978 * only intends to ensure the pages are faulted in.
1979 * @vmas: array of pointers to vmas corresponding to each page. 1979 * @vmas: array of pointers to vmas corresponding to each page.
1980 * Or NULL if the caller does not require them. 1980 * Or NULL if the caller does not require them.
1981 * 1981 *
1982 * Returns number of pages pinned. This may be fewer than the number 1982 * Returns number of pages pinned. This may be fewer than the number
1983 * requested. If nr_pages is 0 or negative, returns 0. If no pages 1983 * requested. If nr_pages is 0 or negative, returns 0. If no pages
1984 * were pinned, returns -errno. Each page returned must be released 1984 * were pinned, returns -errno. Each page returned must be released
1985 * with a put_page() call when it is finished with. vmas will only 1985 * with a put_page() call when it is finished with. vmas will only
1986 * remain valid while mmap_sem is held. 1986 * remain valid while mmap_sem is held.
1987 * 1987 *
1988 * Must be called with mmap_sem held for read or write. 1988 * Must be called with mmap_sem held for read or write.
1989 * 1989 *
1990 * get_user_pages walks a process's page tables and takes a reference to 1990 * get_user_pages walks a process's page tables and takes a reference to
1991 * each struct page that each user address corresponds to at a given 1991 * each struct page that each user address corresponds to at a given
1992 * instant. That is, it takes the page that would be accessed if a user 1992 * instant. That is, it takes the page that would be accessed if a user
1993 * thread accesses the given user virtual address at that instant. 1993 * thread accesses the given user virtual address at that instant.
1994 * 1994 *
1995 * This does not guarantee that the page exists in the user mappings when 1995 * This does not guarantee that the page exists in the user mappings when
1996 * get_user_pages returns, and there may even be a completely different 1996 * get_user_pages returns, and there may even be a completely different
1997 * page there in some cases (eg. if mmapped pagecache has been invalidated 1997 * page there in some cases (eg. if mmapped pagecache has been invalidated
1998 * and subsequently re-faulted). However it does guarantee that the page 1998 * and subsequently re-faulted). However it does guarantee that the page
1999 * won't be freed completely. And mostly callers simply care that the page 1999 * won't be freed completely. And mostly callers simply care that the page
2000 * contains data that was valid *at some point in time*. Typically, an IO 2000 * contains data that was valid *at some point in time*. Typically, an IO
2001 * or similar operation cannot guarantee anything stronger anyway because 2001 * or similar operation cannot guarantee anything stronger anyway because
2002 * locks can't be held over the syscall boundary. 2002 * locks can't be held over the syscall boundary.
2003 * 2003 *
2004 * If write=0, the page must not be written to. If the page is written to, 2004 * If write=0, the page must not be written to. If the page is written to,
2005 * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called 2005 * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
2006 * after the page is finished with, and before put_page is called. 2006 * after the page is finished with, and before put_page is called.
2007 * 2007 *
2008 * get_user_pages is typically used for fewer-copy IO operations, to get a 2008 * get_user_pages is typically used for fewer-copy IO operations, to get a
2009 * handle on the memory by some means other than accesses via the user virtual 2009 * handle on the memory by some means other than accesses via the user virtual
2010 * addresses. The pages may be submitted for DMA to devices or accessed via 2010 * addresses. The pages may be submitted for DMA to devices or accessed via
2011 * their kernel linear mapping (via the kmap APIs). Care should be taken to 2011 * their kernel linear mapping (via the kmap APIs). Care should be taken to
2012 * use the correct cache flushing APIs. 2012 * use the correct cache flushing APIs.
2013 * 2013 *
2014 * See also get_user_pages_fast, for performance critical applications. 2014 * See also get_user_pages_fast, for performance critical applications.
2015 */ 2015 */
2016 long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 2016 long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
2017 unsigned long start, unsigned long nr_pages, int write, 2017 unsigned long start, unsigned long nr_pages, int write,
2018 int force, struct page **pages, struct vm_area_struct **vmas) 2018 int force, struct page **pages, struct vm_area_struct **vmas)
2019 { 2019 {
2020 int flags = FOLL_TOUCH; 2020 int flags = FOLL_TOUCH;
2021 2021
2022 if (pages) 2022 if (pages)
2023 flags |= FOLL_GET; 2023 flags |= FOLL_GET;
2024 if (write) 2024 if (write)
2025 flags |= FOLL_WRITE; 2025 flags |= FOLL_WRITE;
2026 if (force) 2026 if (force)
2027 flags |= FOLL_FORCE; 2027 flags |= FOLL_FORCE;
2028 2028
2029 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, 2029 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
2030 NULL); 2030 NULL);
2031 } 2031 }
2032 EXPORT_SYMBOL(get_user_pages); 2032 EXPORT_SYMBOL(get_user_pages);
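/*
 * A minimal usage sketch of get_user_pages(): pin a user buffer, touch it
 * through the kernel mapping, then dirty and release every page as the
 * comment above requires.  Purely illustrative; example_pin_and_zero() is
 * a placeholder name and error handling is kept to the bare minimum.
 */
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/string.h>

static long example_pin_and_zero(unsigned long start, unsigned long nr_pages)
{
	struct page **pages;
	long pinned, i;

	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	down_read(&current->mm->mmap_sem);
	pinned = get_user_pages(current, current->mm, start, nr_pages,
				1 /* write */, 0 /* force */, pages, NULL);
	up_read(&current->mm->mmap_sem);

	for (i = 0; i < pinned; i++) {
		void *kaddr = kmap(pages[i]);

		memset(kaddr, 0, PAGE_SIZE);	/* access via kernel mapping */
		kunmap(pages[i]);
		flush_dcache_page(pages[i]);
		set_page_dirty_lock(pages[i]);	/* we wrote to the page */
		put_page(pages[i]);		/* drop the pin */
	}
	kfree(pages);
	return pinned < 0 ? pinned : 0;
}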
2033 2033
2034 /** 2034 /**
2035 * get_dump_page() - pin user page in memory while writing it to core dump 2035 * get_dump_page() - pin user page in memory while writing it to core dump
2036 * @addr: user address 2036 * @addr: user address
2037 * 2037 *
2038 * Returns struct page pointer of user page pinned for dump, 2038 * Returns struct page pointer of user page pinned for dump,
2039 * to be freed afterwards by page_cache_release() or put_page(). 2039 * to be freed afterwards by page_cache_release() or put_page().
2040 * 2040 *
2041 * Returns NULL on any kind of failure - a hole must then be inserted into 2041 * Returns NULL on any kind of failure - a hole must then be inserted into
2042 * the corefile, to preserve alignment with its headers; and also returns 2042 * the corefile, to preserve alignment with its headers; and also returns
2043 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - 2043 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
2044 * allowing a hole to be left in the corefile to save diskspace. 2044 * allowing a hole to be left in the corefile to save diskspace.
2045 * 2045 *
2046 * Called without mmap_sem, but after all other threads have been killed. 2046 * Called without mmap_sem, but after all other threads have been killed.
2047 */ 2047 */
2048 #ifdef CONFIG_ELF_CORE 2048 #ifdef CONFIG_ELF_CORE
2049 struct page *get_dump_page(unsigned long addr) 2049 struct page *get_dump_page(unsigned long addr)
2050 { 2050 {
2051 struct vm_area_struct *vma; 2051 struct vm_area_struct *vma;
2052 struct page *page; 2052 struct page *page;
2053 2053
2054 if (__get_user_pages(current, current->mm, addr, 1, 2054 if (__get_user_pages(current, current->mm, addr, 1,
2055 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, 2055 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
2056 NULL) < 1) 2056 NULL) < 1)
2057 return NULL; 2057 return NULL;
2058 flush_cache_page(vma, addr, page_to_pfn(page)); 2058 flush_cache_page(vma, addr, page_to_pfn(page));
2059 return page; 2059 return page;
2060 } 2060 }
2061 #endif /* CONFIG_ELF_CORE */ 2061 #endif /* CONFIG_ELF_CORE */
2062 2062
2063 pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, 2063 pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
2064 spinlock_t **ptl) 2064 spinlock_t **ptl)
2065 { 2065 {
2066 pgd_t * pgd = pgd_offset(mm, addr); 2066 pgd_t * pgd = pgd_offset(mm, addr);
2067 pud_t * pud = pud_alloc(mm, pgd, addr); 2067 pud_t * pud = pud_alloc(mm, pgd, addr);
2068 if (pud) { 2068 if (pud) {
2069 pmd_t * pmd = pmd_alloc(mm, pud, addr); 2069 pmd_t * pmd = pmd_alloc(mm, pud, addr);
2070 if (pmd) { 2070 if (pmd) {
2071 VM_BUG_ON(pmd_trans_huge(*pmd)); 2071 VM_BUG_ON(pmd_trans_huge(*pmd));
2072 return pte_alloc_map_lock(mm, pmd, addr, ptl); 2072 return pte_alloc_map_lock(mm, pmd, addr, ptl);
2073 } 2073 }
2074 } 2074 }
2075 return NULL; 2075 return NULL;
2076 } 2076 }
2077 2077
2078 /* 2078 /*
2079 * This is the old fallback for page remapping. 2079 * This is the old fallback for page remapping.
2080 * 2080 *
2081 * For historical reasons, it only allows reserved pages. Only 2081 * For historical reasons, it only allows reserved pages. Only
2082 * old drivers should use this, and they needed to mark their 2082 * old drivers should use this, and they needed to mark their
2083 * pages reserved for the old functions anyway. 2083 * pages reserved for the old functions anyway.
2084 */ 2084 */
2085 static int insert_page(struct vm_area_struct *vma, unsigned long addr, 2085 static int insert_page(struct vm_area_struct *vma, unsigned long addr,
2086 struct page *page, pgprot_t prot) 2086 struct page *page, pgprot_t prot)
2087 { 2087 {
2088 struct mm_struct *mm = vma->vm_mm; 2088 struct mm_struct *mm = vma->vm_mm;
2089 int retval; 2089 int retval;
2090 pte_t *pte; 2090 pte_t *pte;
2091 spinlock_t *ptl; 2091 spinlock_t *ptl;
2092 2092
2093 retval = -EINVAL; 2093 retval = -EINVAL;
2094 if (PageAnon(page)) 2094 if (PageAnon(page))
2095 goto out; 2095 goto out;
2096 retval = -ENOMEM; 2096 retval = -ENOMEM;
2097 flush_dcache_page(page); 2097 flush_dcache_page(page);
2098 pte = get_locked_pte(mm, addr, &ptl); 2098 pte = get_locked_pte(mm, addr, &ptl);
2099 if (!pte) 2099 if (!pte)
2100 goto out; 2100 goto out;
2101 retval = -EBUSY; 2101 retval = -EBUSY;
2102 if (!pte_none(*pte)) 2102 if (!pte_none(*pte))
2103 goto out_unlock; 2103 goto out_unlock;
2104 2104
2105 /* Ok, finally just insert the thing.. */ 2105 /* Ok, finally just insert the thing.. */
2106 get_page(page); 2106 get_page(page);
2107 inc_mm_counter_fast(mm, MM_FILEPAGES); 2107 inc_mm_counter_fast(mm, MM_FILEPAGES);
2108 page_add_file_rmap(page); 2108 page_add_file_rmap(page);
2109 set_pte_at(mm, addr, pte, mk_pte(page, prot)); 2109 set_pte_at(mm, addr, pte, mk_pte(page, prot));
2110 2110
2111 retval = 0; 2111 retval = 0;
2112 pte_unmap_unlock(pte, ptl); 2112 pte_unmap_unlock(pte, ptl);
2113 return retval; 2113 return retval;
2114 out_unlock: 2114 out_unlock:
2115 pte_unmap_unlock(pte, ptl); 2115 pte_unmap_unlock(pte, ptl);
2116 out: 2116 out:
2117 return retval; 2117 return retval;
2118 } 2118 }
2119 2119
2120 /** 2120 /**
2121 * vm_insert_page - insert single page into user vma 2121 * vm_insert_page - insert single page into user vma
2122 * @vma: user vma to map to 2122 * @vma: user vma to map to
2123 * @addr: target user address of this page 2123 * @addr: target user address of this page
2124 * @page: source kernel page 2124 * @page: source kernel page
2125 * 2125 *
2126 * This allows drivers to insert individual pages they've allocated 2126 * This allows drivers to insert individual pages they've allocated
2127 * into a user vma. 2127 * into a user vma.
2128 * 2128 *
2129 * The page has to be a nice clean _individual_ kernel allocation. 2129 * The page has to be a nice clean _individual_ kernel allocation.
2130 * If you allocate a compound page, you need to have marked it as 2130 * If you allocate a compound page, you need to have marked it as
2131 * such (__GFP_COMP), or manually just split the page up yourself 2131 * such (__GFP_COMP), or manually just split the page up yourself
2132 * (see split_page()). 2132 * (see split_page()).
2133 * 2133 *
2134 * NOTE! Traditionally this was done with "remap_pfn_range()" which 2134 * NOTE! Traditionally this was done with "remap_pfn_range()" which
2135 * took an arbitrary page protection parameter. This doesn't allow 2135 * took an arbitrary page protection parameter. This doesn't allow
2136 * that. Your vma protection will have to be set up correctly, which 2136 * that. Your vma protection will have to be set up correctly, which
2137 * means that if you want a shared writable mapping, you'd better 2137 * means that if you want a shared writable mapping, you'd better
2138 * ask for a shared writable mapping! 2138 * ask for a shared writable mapping!
2139 * 2139 *
2140 * The page does not need to be reserved. 2140 * The page does not need to be reserved.
2141 * 2141 *
2142 * Usually this function is called from f_op->mmap() handler 2142 * Usually this function is called from f_op->mmap() handler
2143 * under mm->mmap_sem write-lock, so it can change vma->vm_flags. 2143 * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
2144 * Caller must set VM_MIXEDMAP on vma if it wants to call this 2144 * Caller must set VM_MIXEDMAP on vma if it wants to call this
2145 * function from other places, for example from page-fault handler. 2145 * function from other places, for example from page-fault handler.
2146 */ 2146 */
2147 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, 2147 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
2148 struct page *page) 2148 struct page *page)
2149 { 2149 {
2150 if (addr < vma->vm_start || addr >= vma->vm_end) 2150 if (addr < vma->vm_start || addr >= vma->vm_end)
2151 return -EFAULT; 2151 return -EFAULT;
2152 if (!page_count(page)) 2152 if (!page_count(page))
2153 return -EINVAL; 2153 return -EINVAL;
2154 if (!(vma->vm_flags & VM_MIXEDMAP)) { 2154 if (!(vma->vm_flags & VM_MIXEDMAP)) {
2155 BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem)); 2155 BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
2156 BUG_ON(vma->vm_flags & VM_PFNMAP); 2156 BUG_ON(vma->vm_flags & VM_PFNMAP);
2157 vma->vm_flags |= VM_MIXEDMAP; 2157 vma->vm_flags |= VM_MIXEDMAP;
2158 } 2158 }
2159 return insert_page(vma, addr, page, vma->vm_page_prot); 2159 return insert_page(vma, addr, page, vma->vm_page_prot);
2160 } 2160 }
2161 EXPORT_SYMBOL(vm_insert_page); 2161 EXPORT_SYMBOL(vm_insert_page);
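/*
 * A minimal usage sketch of vm_insert_page() from an f_op->mmap handler,
 * as described above.  Illustrative only: my_buf_page is assumed to be a
 * single page obtained with alloc_page() at init time, and my_dev_mmap is
 * a placeholder name.
 */
#include <linux/fs.h>
#include <linux/mm.h>

static struct page *my_buf_page;	/* assumed: alloc_page() at init */

static int my_dev_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (vma->vm_end - vma->vm_start < PAGE_SIZE)
		return -EINVAL;
	/* Called under mmap_sem for write, so vm_flags may be updated. */
	return vm_insert_page(vma, vma->vm_start, my_buf_page);
}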
2162 2162
2163 static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, 2163 static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2164 unsigned long pfn, pgprot_t prot) 2164 unsigned long pfn, pgprot_t prot)
2165 { 2165 {
2166 struct mm_struct *mm = vma->vm_mm; 2166 struct mm_struct *mm = vma->vm_mm;
2167 int retval; 2167 int retval;
2168 pte_t *pte, entry; 2168 pte_t *pte, entry;
2169 spinlock_t *ptl; 2169 spinlock_t *ptl;
2170 2170
2171 retval = -ENOMEM; 2171 retval = -ENOMEM;
2172 pte = get_locked_pte(mm, addr, &ptl); 2172 pte = get_locked_pte(mm, addr, &ptl);
2173 if (!pte) 2173 if (!pte)
2174 goto out; 2174 goto out;
2175 retval = -EBUSY; 2175 retval = -EBUSY;
2176 if (!pte_none(*pte)) 2176 if (!pte_none(*pte))
2177 goto out_unlock; 2177 goto out_unlock;
2178 2178
2179 /* Ok, finally just insert the thing.. */ 2179 /* Ok, finally just insert the thing.. */
2180 entry = pte_mkspecial(pfn_pte(pfn, prot)); 2180 entry = pte_mkspecial(pfn_pte(pfn, prot));
2181 set_pte_at(mm, addr, pte, entry); 2181 set_pte_at(mm, addr, pte, entry);
2182 update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ 2182 update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
2183 2183
2184 retval = 0; 2184 retval = 0;
2185 out_unlock: 2185 out_unlock:
2186 pte_unmap_unlock(pte, ptl); 2186 pte_unmap_unlock(pte, ptl);
2187 out: 2187 out:
2188 return retval; 2188 return retval;
2189 } 2189 }
2190 2190
2191 /** 2191 /**
2192 * vm_insert_pfn - insert single pfn into user vma 2192 * vm_insert_pfn - insert single pfn into user vma
2193 * @vma: user vma to map to 2193 * @vma: user vma to map to
2194 * @addr: target user address of this page 2194 * @addr: target user address of this page
2195 * @pfn: source kernel pfn 2195 * @pfn: source kernel pfn
2196 * 2196 *
2197 * Similar to vm_insert_page, this allows drivers to insert individual pages 2197 * Similar to vm_insert_page, this allows drivers to insert individual pages
2198 * they've allocated into a user vma. Same comments apply. 2198 * they've allocated into a user vma. Same comments apply.
2199 * 2199 *
2200 * This function should only be called from a vm_ops->fault handler, and 2200 * This function should only be called from a vm_ops->fault handler, and
2201 * in that case the handler should return NULL. 2201 * in that case the handler should return NULL.
2202 * 2202 *
2203 * vma cannot be a COW mapping. 2203 * vma cannot be a COW mapping.
2204 * 2204 *
2205 * As this is called only for pages that do not currently exist, we 2205 * As this is called only for pages that do not currently exist, we
2206 * do not need to flush old virtual caches or the TLB. 2206 * do not need to flush old virtual caches or the TLB.
2207 */ 2207 */
2208 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, 2208 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2209 unsigned long pfn) 2209 unsigned long pfn)
2210 { 2210 {
2211 int ret; 2211 int ret;
2212 pgprot_t pgprot = vma->vm_page_prot; 2212 pgprot_t pgprot = vma->vm_page_prot;
2213 /* 2213 /*
2214 * Technically, architectures with pte_special can avoid all these 2214 * Technically, architectures with pte_special can avoid all these
2215 * restrictions (same for remap_pfn_range). However we would like 2215 * restrictions (same for remap_pfn_range). However we would like
2216 * consistency in testing and feature parity among all, so we should 2216 * consistency in testing and feature parity among all, so we should
2217 * try to keep these invariants in place for everybody. 2217 * try to keep these invariants in place for everybody.
2218 */ 2218 */
2219 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); 2219 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
2220 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == 2220 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
2221 (VM_PFNMAP|VM_MIXEDMAP)); 2221 (VM_PFNMAP|VM_MIXEDMAP));
2222 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); 2222 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2223 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); 2223 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2224 2224
2225 if (addr < vma->vm_start || addr >= vma->vm_end) 2225 if (addr < vma->vm_start || addr >= vma->vm_end)
2226 return -EFAULT; 2226 return -EFAULT;
2227 if (track_pfn_insert(vma, &pgprot, pfn)) 2227 if (track_pfn_insert(vma, &pgprot, pfn))
2228 return -EINVAL; 2228 return -EINVAL;
2229 2229
2230 ret = insert_pfn(vma, addr, pfn, pgprot); 2230 ret = insert_pfn(vma, addr, pfn, pgprot);
2231 2231
2232 return ret; 2232 return ret;
2233 } 2233 }
2234 EXPORT_SYMBOL(vm_insert_pfn); 2234 EXPORT_SYMBOL(vm_insert_pfn);
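/*
 * A minimal usage sketch of vm_insert_pfn() from a vm_ops->fault handler,
 * as suggested above.  Illustrative only: my_dev_phys is an assumed device
 * base address, range checking is omitted, and the vma is assumed to have
 * been set up with VM_IO | VM_PFNMAP at mmap time.
 */
#include <linux/mm.h>

static phys_addr_t my_dev_phys;		/* assumed device base address */

static int my_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long pfn = (my_dev_phys >> PAGE_SHIFT) + vmf->pgoff;
	int ret;

	ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn);
	switch (ret) {
	case 0:
	case -EBUSY:			/* raced with another fault: fine */
		return VM_FAULT_NOPAGE;	/* pte installed, no struct page */
	case -ENOMEM:
		return VM_FAULT_OOM;
	default:
		return VM_FAULT_SIGBUS;
	}
}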
2235 2235
2236 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, 2236 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
2237 unsigned long pfn) 2237 unsigned long pfn)
2238 { 2238 {
2239 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); 2239 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
2240 2240
2241 if (addr < vma->vm_start || addr >= vma->vm_end) 2241 if (addr < vma->vm_start || addr >= vma->vm_end)
2242 return -EFAULT; 2242 return -EFAULT;
2243 2243
2244 /* 2244 /*
2245 * If we don't have pte special, then we have to use the pfn_valid() 2245 * If we don't have pte special, then we have to use the pfn_valid()
2246 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* 2246 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
2247 * refcount the page if pfn_valid is true (hence insert_page rather 2247 * refcount the page if pfn_valid is true (hence insert_page rather
2248 * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP 2248 * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
2249 * without pte special, it would then be refcounted as a normal page. 2249 * without pte special, it would then be refcounted as a normal page.
2250 */ 2250 */
2251 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { 2251 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
2252 struct page *page; 2252 struct page *page;
2253 2253
2254 page = pfn_to_page(pfn); 2254 page = pfn_to_page(pfn);
2255 return insert_page(vma, addr, page, vma->vm_page_prot); 2255 return insert_page(vma, addr, page, vma->vm_page_prot);
2256 } 2256 }
2257 return insert_pfn(vma, addr, pfn, vma->vm_page_prot); 2257 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
2258 } 2258 }
2259 EXPORT_SYMBOL(vm_insert_mixed); 2259 EXPORT_SYMBOL(vm_insert_mixed);
2260 2260
2261 /* 2261 /*
2262 * maps a range of physical memory into the requested pages. the old 2262 * maps a range of physical memory into the requested pages. the old
2263 * mappings are removed. any references to nonexistent pages results 2263 * mappings are removed. any references to nonexistent pages results
2264 * in null mappings (currently treated as "copy-on-access") 2264 * in null mappings (currently treated as "copy-on-access")
2265 */ 2265 */
2266 static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, 2266 static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
2267 unsigned long addr, unsigned long end, 2267 unsigned long addr, unsigned long end,
2268 unsigned long pfn, pgprot_t prot) 2268 unsigned long pfn, pgprot_t prot)
2269 { 2269 {
2270 pte_t *pte; 2270 pte_t *pte;
2271 spinlock_t *ptl; 2271 spinlock_t *ptl;
2272 2272
2273 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); 2273 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
2274 if (!pte) 2274 if (!pte)
2275 return -ENOMEM; 2275 return -ENOMEM;
2276 arch_enter_lazy_mmu_mode(); 2276 arch_enter_lazy_mmu_mode();
2277 do { 2277 do {
2278 BUG_ON(!pte_none(*pte)); 2278 BUG_ON(!pte_none(*pte));
2279 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); 2279 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
2280 pfn++; 2280 pfn++;
2281 } while (pte++, addr += PAGE_SIZE, addr != end); 2281 } while (pte++, addr += PAGE_SIZE, addr != end);
2282 arch_leave_lazy_mmu_mode(); 2282 arch_leave_lazy_mmu_mode();
2283 pte_unmap_unlock(pte - 1, ptl); 2283 pte_unmap_unlock(pte - 1, ptl);
2284 return 0; 2284 return 0;
2285 } 2285 }
2286 2286
2287 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, 2287 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
2288 unsigned long addr, unsigned long end, 2288 unsigned long addr, unsigned long end,
2289 unsigned long pfn, pgprot_t prot) 2289 unsigned long pfn, pgprot_t prot)
2290 { 2290 {
2291 pmd_t *pmd; 2291 pmd_t *pmd;
2292 unsigned long next; 2292 unsigned long next;
2293 2293
2294 pfn -= addr >> PAGE_SHIFT; 2294 pfn -= addr >> PAGE_SHIFT;
2295 pmd = pmd_alloc(mm, pud, addr); 2295 pmd = pmd_alloc(mm, pud, addr);
2296 if (!pmd) 2296 if (!pmd)
2297 return -ENOMEM; 2297 return -ENOMEM;
2298 VM_BUG_ON(pmd_trans_huge(*pmd)); 2298 VM_BUG_ON(pmd_trans_huge(*pmd));
2299 do { 2299 do {
2300 next = pmd_addr_end(addr, end); 2300 next = pmd_addr_end(addr, end);
2301 if (remap_pte_range(mm, pmd, addr, next, 2301 if (remap_pte_range(mm, pmd, addr, next,
2302 pfn + (addr >> PAGE_SHIFT), prot)) 2302 pfn + (addr >> PAGE_SHIFT), prot))
2303 return -ENOMEM; 2303 return -ENOMEM;
2304 } while (pmd++, addr = next, addr != end); 2304 } while (pmd++, addr = next, addr != end);
2305 return 0; 2305 return 0;
2306 } 2306 }
2307 2307
2308 static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, 2308 static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
2309 unsigned long addr, unsigned long end, 2309 unsigned long addr, unsigned long end,
2310 unsigned long pfn, pgprot_t prot) 2310 unsigned long pfn, pgprot_t prot)
2311 { 2311 {
2312 pud_t *pud; 2312 pud_t *pud;
2313 unsigned long next; 2313 unsigned long next;
2314 2314
2315 pfn -= addr >> PAGE_SHIFT; 2315 pfn -= addr >> PAGE_SHIFT;
2316 pud = pud_alloc(mm, pgd, addr); 2316 pud = pud_alloc(mm, pgd, addr);
2317 if (!pud) 2317 if (!pud)
2318 return -ENOMEM; 2318 return -ENOMEM;
2319 do { 2319 do {
2320 next = pud_addr_end(addr, end); 2320 next = pud_addr_end(addr, end);
2321 if (remap_pmd_range(mm, pud, addr, next, 2321 if (remap_pmd_range(mm, pud, addr, next,
2322 pfn + (addr >> PAGE_SHIFT), prot)) 2322 pfn + (addr >> PAGE_SHIFT), prot))
2323 return -ENOMEM; 2323 return -ENOMEM;
2324 } while (pud++, addr = next, addr != end); 2324 } while (pud++, addr = next, addr != end);
2325 return 0; 2325 return 0;
2326 } 2326 }
2327 2327
2328 /** 2328 /**
2329 * remap_pfn_range - remap kernel memory to userspace 2329 * remap_pfn_range - remap kernel memory to userspace
2330 * @vma: user vma to map to 2330 * @vma: user vma to map to
2331 * @addr: target user address to start at 2331 * @addr: target user address to start at
2332 * @pfn: physical address of kernel memory 2332 * @pfn: physical address of kernel memory
2333 * @size: size of map area 2333 * @size: size of map area
2334 * @prot: page protection flags for this mapping 2334 * @prot: page protection flags for this mapping
2335 * 2335 *
2336 * Note: this is only safe if the mm semaphore is held when called. 2336 * Note: this is only safe if the mm semaphore is held when called.
2337 */ 2337 */
2338 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, 2338 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2339 unsigned long pfn, unsigned long size, pgprot_t prot) 2339 unsigned long pfn, unsigned long size, pgprot_t prot)
2340 { 2340 {
2341 pgd_t *pgd; 2341 pgd_t *pgd;
2342 unsigned long next; 2342 unsigned long next;
2343 unsigned long end = addr + PAGE_ALIGN(size); 2343 unsigned long end = addr + PAGE_ALIGN(size);
2344 struct mm_struct *mm = vma->vm_mm; 2344 struct mm_struct *mm = vma->vm_mm;
2345 int err; 2345 int err;
2346 2346
2347 /* 2347 /*
2348 * Physically remapped pages are special. Tell the 2348 * Physically remapped pages are special. Tell the
2349 * rest of the world about it: 2349 * rest of the world about it:
2350 * VM_IO tells people not to look at these pages 2350 * VM_IO tells people not to look at these pages
2351 * (accesses can have side effects). 2351 * (accesses can have side effects).
2352 * VM_PFNMAP tells the core MM that the base pages are just 2352 * VM_PFNMAP tells the core MM that the base pages are just
2353 * raw PFN mappings, and do not have a "struct page" associated 2353 * raw PFN mappings, and do not have a "struct page" associated
2354 * with them. 2354 * with them.
2355 * VM_DONTEXPAND 2355 * VM_DONTEXPAND
2356 * Disable vma merging and expanding with mremap(). 2356 * Disable vma merging and expanding with mremap().
2357 * VM_DONTDUMP 2357 * VM_DONTDUMP
2358 * Omit vma from core dump, even when VM_IO turned off. 2358 * Omit vma from core dump, even when VM_IO turned off.
2359 * 2359 *
2360 * There's a horrible special case to handle copy-on-write 2360 * There's a horrible special case to handle copy-on-write
2361 * behaviour that some programs depend on. We mark the "original" 2361 * behaviour that some programs depend on. We mark the "original"
2362 * un-COW'ed pages by matching them up with "vma->vm_pgoff". 2362 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
2363 * See vm_normal_page() for details. 2363 * See vm_normal_page() for details.
2364 */ 2364 */
2365 if (is_cow_mapping(vma->vm_flags)) { 2365 if (is_cow_mapping(vma->vm_flags)) {
2366 if (addr != vma->vm_start || end != vma->vm_end) 2366 if (addr != vma->vm_start || end != vma->vm_end)
2367 return -EINVAL; 2367 return -EINVAL;
2368 vma->vm_pgoff = pfn; 2368 vma->vm_pgoff = pfn;
2369 } 2369 }
2370 2370
2371 err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); 2371 err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
2372 if (err) 2372 if (err)
2373 return -EINVAL; 2373 return -EINVAL;
2374 2374
2375 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; 2375 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
2376 2376
2377 BUG_ON(addr >= end); 2377 BUG_ON(addr >= end);
2378 pfn -= addr >> PAGE_SHIFT; 2378 pfn -= addr >> PAGE_SHIFT;
2379 pgd = pgd_offset(mm, addr); 2379 pgd = pgd_offset(mm, addr);
2380 flush_cache_range(vma, addr, end); 2380 flush_cache_range(vma, addr, end);
2381 do { 2381 do {
2382 next = pgd_addr_end(addr, end); 2382 next = pgd_addr_end(addr, end);
2383 err = remap_pud_range(mm, pgd, addr, next, 2383 err = remap_pud_range(mm, pgd, addr, next,
2384 pfn + (addr >> PAGE_SHIFT), prot); 2384 pfn + (addr >> PAGE_SHIFT), prot);
2385 if (err) 2385 if (err)
2386 break; 2386 break;
2387 } while (pgd++, addr = next, addr != end); 2387 } while (pgd++, addr = next, addr != end);
2388 2388
2389 if (err) 2389 if (err)
2390 untrack_pfn(vma, pfn, PAGE_ALIGN(size)); 2390 untrack_pfn(vma, pfn, PAGE_ALIGN(size));
2391 2391
2392 return err; 2392 return err;
2393 } 2393 }
2394 EXPORT_SYMBOL(remap_pfn_range); 2394 EXPORT_SYMBOL(remap_pfn_range);
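/*
 * A minimal usage sketch of remap_pfn_range() from an f_op->mmap handler,
 * remapping a physical region at the assumed address my_regs_phys over the
 * whole vma.  Illustrative only; real drivers usually also adjust
 * vma->vm_page_prot (e.g. for uncached access) before calling this.
 */
#include <linux/fs.h>
#include <linux/mm.h>

static phys_addr_t my_regs_phys;	/* assumed register block address */

static int my_regs_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	return remap_pfn_range(vma, vma->vm_start,
			       (my_regs_phys >> PAGE_SHIFT) + vma->vm_pgoff,
			       size, vma->vm_page_prot);
}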
2395 2395
2396 /** 2396 /**
2397 * vm_iomap_memory - remap memory to userspace 2397 * vm_iomap_memory - remap memory to userspace
2398 * @vma: user vma to map to 2398 * @vma: user vma to map to
2399 * @start: start of area 2399 * @start: start of area
2400 * @len: size of area 2400 * @len: size of area
2401 * 2401 *
2402 * This is a simplified io_remap_pfn_range() for common driver use. The 2402 * This is a simplified io_remap_pfn_range() for common driver use. The
2403 * driver just needs to give us the physical memory range to be mapped, 2403 * driver just needs to give us the physical memory range to be mapped,
2404 * we'll figure out the rest from the vma information. 2404 * we'll figure out the rest from the vma information.
2405 * 2405 *
2406 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get 2406 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
2407 * whatever write-combining details or similar. 2407 * whatever write-combining details or similar.
2408 */ 2408 */
2409 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) 2409 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
2410 { 2410 {
2411 unsigned long vm_len, pfn, pages; 2411 unsigned long vm_len, pfn, pages;
2412 2412
2413 /* Check that the physical memory area passed in looks valid */ 2413 /* Check that the physical memory area passed in looks valid */
2414 if (start + len < start) 2414 if (start + len < start)
2415 return -EINVAL; 2415 return -EINVAL;
2416 /* 2416 /*
2417 * You *really* shouldn't map things that aren't page-aligned, 2417 * You *really* shouldn't map things that aren't page-aligned,
2418 * but we've historically allowed it because IO memory might 2418 * but we've historically allowed it because IO memory might
2419 * just have smaller alignment. 2419 * just have smaller alignment.
2420 */ 2420 */
2421 len += start & ~PAGE_MASK; 2421 len += start & ~PAGE_MASK;
2422 pfn = start >> PAGE_SHIFT; 2422 pfn = start >> PAGE_SHIFT;
2423 pages = (len + ~PAGE_MASK) >> PAGE_SHIFT; 2423 pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
2424 if (pfn + pages < pfn) 2424 if (pfn + pages < pfn)
2425 return -EINVAL; 2425 return -EINVAL;
2426 2426
2427 /* We start the mapping 'vm_pgoff' pages into the area */ 2427 /* We start the mapping 'vm_pgoff' pages into the area */
2428 if (vma->vm_pgoff > pages) 2428 if (vma->vm_pgoff > pages)
2429 return -EINVAL; 2429 return -EINVAL;
2430 pfn += vma->vm_pgoff; 2430 pfn += vma->vm_pgoff;
2431 pages -= vma->vm_pgoff; 2431 pages -= vma->vm_pgoff;
2432 2432
2433 /* Can we fit all of the mapping? */ 2433 /* Can we fit all of the mapping? */
2434 vm_len = vma->vm_end - vma->vm_start; 2434 vm_len = vma->vm_end - vma->vm_start;
2435 if (vm_len >> PAGE_SHIFT > pages) 2435 if (vm_len >> PAGE_SHIFT > pages)
2436 return -EINVAL; 2436 return -EINVAL;
2437 2437
2438 /* Ok, let it rip */ 2438 /* Ok, let it rip */
2439 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); 2439 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
2440 } 2440 }
2441 EXPORT_SYMBOL(vm_iomap_memory); 2441 EXPORT_SYMBOL(vm_iomap_memory);
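/*
 * A minimal usage sketch of vm_iomap_memory(): the simplified form of a
 * driver mmap handler, letting the helper derive pfn, offset and length
 * checks from the vma.  Illustrative only; my_fb_phys and my_fb_len are
 * assumed to describe the device memory being exported.
 */
#include <linux/fs.h>
#include <linux/mm.h>

static phys_addr_t my_fb_phys;		/* assumed framebuffer base */
static unsigned long my_fb_len;		/* assumed framebuffer length */

static int my_fb_mmap(struct file *file, struct vm_area_struct *vma)
{
	return vm_iomap_memory(vma, my_fb_phys, my_fb_len);
}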
2442 2442
2443 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, 2443 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
2444 unsigned long addr, unsigned long end, 2444 unsigned long addr, unsigned long end,
2445 pte_fn_t fn, void *data) 2445 pte_fn_t fn, void *data)
2446 { 2446 {
2447 pte_t *pte; 2447 pte_t *pte;
2448 int err; 2448 int err;
2449 pgtable_t token; 2449 pgtable_t token;
2450 spinlock_t *uninitialized_var(ptl); 2450 spinlock_t *uninitialized_var(ptl);
2451 2451
2452 pte = (mm == &init_mm) ? 2452 pte = (mm == &init_mm) ?
2453 pte_alloc_kernel(pmd, addr) : 2453 pte_alloc_kernel(pmd, addr) :
2454 pte_alloc_map_lock(mm, pmd, addr, &ptl); 2454 pte_alloc_map_lock(mm, pmd, addr, &ptl);
2455 if (!pte) 2455 if (!pte)
2456 return -ENOMEM; 2456 return -ENOMEM;
2457 2457
2458 BUG_ON(pmd_huge(*pmd)); 2458 BUG_ON(pmd_huge(*pmd));
2459 2459
2460 arch_enter_lazy_mmu_mode(); 2460 arch_enter_lazy_mmu_mode();
2461 2461
2462 token = pmd_pgtable(*pmd); 2462 token = pmd_pgtable(*pmd);
2463 2463
2464 do { 2464 do {
2465 err = fn(pte++, token, addr, data); 2465 err = fn(pte++, token, addr, data);
2466 if (err) 2466 if (err)
2467 break; 2467 break;
2468 } while (addr += PAGE_SIZE, addr != end); 2468 } while (addr += PAGE_SIZE, addr != end);
2469 2469
2470 arch_leave_lazy_mmu_mode(); 2470 arch_leave_lazy_mmu_mode();
2471 2471
2472 if (mm != &init_mm) 2472 if (mm != &init_mm)
2473 pte_unmap_unlock(pte-1, ptl); 2473 pte_unmap_unlock(pte-1, ptl);
2474 return err; 2474 return err;
2475 } 2475 }
2476 2476
2477 static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, 2477 static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
2478 unsigned long addr, unsigned long end, 2478 unsigned long addr, unsigned long end,
2479 pte_fn_t fn, void *data) 2479 pte_fn_t fn, void *data)
2480 { 2480 {
2481 pmd_t *pmd; 2481 pmd_t *pmd;
2482 unsigned long next; 2482 unsigned long next;
2483 int err; 2483 int err;
2484 2484
2485 BUG_ON(pud_huge(*pud)); 2485 BUG_ON(pud_huge(*pud));
2486 2486
2487 pmd = pmd_alloc(mm, pud, addr); 2487 pmd = pmd_alloc(mm, pud, addr);
2488 if (!pmd) 2488 if (!pmd)
2489 return -ENOMEM; 2489 return -ENOMEM;
2490 do { 2490 do {
2491 next = pmd_addr_end(addr, end); 2491 next = pmd_addr_end(addr, end);
2492 err = apply_to_pte_range(mm, pmd, addr, next, fn, data); 2492 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
2493 if (err) 2493 if (err)
2494 break; 2494 break;
2495 } while (pmd++, addr = next, addr != end); 2495 } while (pmd++, addr = next, addr != end);
2496 return err; 2496 return err;
2497 } 2497 }
2498 2498
2499 static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, 2499 static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
2500 unsigned long addr, unsigned long end, 2500 unsigned long addr, unsigned long end,
2501 pte_fn_t fn, void *data) 2501 pte_fn_t fn, void *data)
2502 { 2502 {
2503 pud_t *pud; 2503 pud_t *pud;
2504 unsigned long next; 2504 unsigned long next;
2505 int err; 2505 int err;
2506 2506
2507 pud = pud_alloc(mm, pgd, addr); 2507 pud = pud_alloc(mm, pgd, addr);
2508 if (!pud) 2508 if (!pud)
2509 return -ENOMEM; 2509 return -ENOMEM;
2510 do { 2510 do {
2511 next = pud_addr_end(addr, end); 2511 next = pud_addr_end(addr, end);
2512 err = apply_to_pmd_range(mm, pud, addr, next, fn, data); 2512 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
2513 if (err) 2513 if (err)
2514 break; 2514 break;
2515 } while (pud++, addr = next, addr != end); 2515 } while (pud++, addr = next, addr != end);
2516 return err; 2516 return err;
2517 } 2517 }
2518 2518
2519 /* 2519 /*
2520 * Scan a region of virtual memory, filling in page tables as necessary 2520 * Scan a region of virtual memory, filling in page tables as necessary
2521 * and calling a provided function on each leaf page table. 2521 * and calling a provided function on each leaf page table.
2522 */ 2522 */
2523 int apply_to_page_range(struct mm_struct *mm, unsigned long addr, 2523 int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2524 unsigned long size, pte_fn_t fn, void *data) 2524 unsigned long size, pte_fn_t fn, void *data)
2525 { 2525 {
2526 pgd_t *pgd; 2526 pgd_t *pgd;
2527 unsigned long next; 2527 unsigned long next;
2528 unsigned long end = addr + size; 2528 unsigned long end = addr + size;
2529 int err; 2529 int err;
2530 2530
2531 BUG_ON(addr >= end); 2531 BUG_ON(addr >= end);
2532 pgd = pgd_offset(mm, addr); 2532 pgd = pgd_offset(mm, addr);
2533 do { 2533 do {
2534 next = pgd_addr_end(addr, end); 2534 next = pgd_addr_end(addr, end);
2535 err = apply_to_pud_range(mm, pgd, addr, next, fn, data); 2535 err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
2536 if (err) 2536 if (err)
2537 break; 2537 break;
2538 } while (pgd++, addr = next, addr != end); 2538 } while (pgd++, addr = next, addr != end);
2539 2539
2540 return err; 2540 return err;
2541 } 2541 }
2542 EXPORT_SYMBOL_GPL(apply_to_page_range); 2542 EXPORT_SYMBOL_GPL(apply_to_page_range);
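/*
 * A minimal usage sketch of apply_to_page_range(): count the present leaf
 * entries covering a kernel virtual range (say, a vmalloc area the caller
 * owns).  Illustrative only; note that the walk allocates intermediate
 * page tables for the range as documented above.
 */
#include <linux/mm.h>

static int example_count_pte(pte_t *pte, pgtable_t token,
			     unsigned long addr, void *data)
{
	unsigned long *count = data;

	if (pte_present(*pte))
		(*count)++;
	return 0;			/* non-zero would abort the walk */
}

static unsigned long example_count_present(unsigned long addr,
					   unsigned long size)
{
	unsigned long count = 0;

	apply_to_page_range(&init_mm, addr, size, example_count_pte, &count);
	return count;
}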
2543 2543
2544 /* 2544 /*
2545 * handle_pte_fault chooses page fault handler according to an entry 2545 * handle_pte_fault chooses page fault handler according to an entry
2546 * which was read non-atomically. Before making any commitment, on 2546 * which was read non-atomically. Before making any commitment, on
2547 * those architectures or configurations (e.g. i386 with PAE) which 2547 * those architectures or configurations (e.g. i386 with PAE) which
2548 * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault 2548 * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
2549 * must check under lock before unmapping the pte and proceeding 2549 * must check under lock before unmapping the pte and proceeding
2550 * (but do_wp_page is only called after already making such a check; 2550 * (but do_wp_page is only called after already making such a check;
2551 * and do_anonymous_page can safely check later on). 2551 * and do_anonymous_page can safely check later on).
2552 */ 2552 */
2553 static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, 2553 static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2554 pte_t *page_table, pte_t orig_pte) 2554 pte_t *page_table, pte_t orig_pte)
2555 { 2555 {
2556 int same = 1; 2556 int same = 1;
2557 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) 2557 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
2558 if (sizeof(pte_t) > sizeof(unsigned long)) { 2558 if (sizeof(pte_t) > sizeof(unsigned long)) {
2559 spinlock_t *ptl = pte_lockptr(mm, pmd); 2559 spinlock_t *ptl = pte_lockptr(mm, pmd);
2560 spin_lock(ptl); 2560 spin_lock(ptl);
2561 same = pte_same(*page_table, orig_pte); 2561 same = pte_same(*page_table, orig_pte);
2562 spin_unlock(ptl); 2562 spin_unlock(ptl);
2563 } 2563 }
2564 #endif 2564 #endif
2565 pte_unmap(page_table); 2565 pte_unmap(page_table);
2566 return same; 2566 return same;
2567 } 2567 }
2568 2568
2569 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) 2569 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2570 { 2570 {
2571 /* 2571 /*
2572 * If the source page was a PFN mapping, we don't have 2572 * If the source page was a PFN mapping, we don't have
2573 * a "struct page" for it. We do a best-effort copy by 2573 * a "struct page" for it. We do a best-effort copy by
2574 * just copying from the original user address. If that 2574 * just copying from the original user address. If that
2575 * fails, we just zero-fill it. Live with it. 2575 * fails, we just zero-fill it. Live with it.
2576 */ 2576 */
2577 if (unlikely(!src)) { 2577 if (unlikely(!src)) {
2578 void *kaddr = kmap_atomic(dst); 2578 void *kaddr = kmap_atomic(dst);
2579 void __user *uaddr = (void __user *)(va & PAGE_MASK); 2579 void __user *uaddr = (void __user *)(va & PAGE_MASK);
2580 2580
2581 /* 2581 /*
2582 * This really shouldn't fail, because the page is there 2582 * This really shouldn't fail, because the page is there
2583 * in the page tables. But it might just be unreadable, 2583 * in the page tables. But it might just be unreadable,
2584 * in which case we just give up and fill the result with 2584 * in which case we just give up and fill the result with
2585 * zeroes. 2585 * zeroes.
2586 */ 2586 */
2587 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) 2587 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2588 clear_page(kaddr); 2588 clear_page(kaddr);
2589 kunmap_atomic(kaddr); 2589 kunmap_atomic(kaddr);
2590 flush_dcache_page(dst); 2590 flush_dcache_page(dst);
2591 } else 2591 } else
2592 copy_user_highpage(dst, src, va, vma); 2592 copy_user_highpage(dst, src, va, vma);
2593 } 2593 }
2594 2594
2595 /* 2595 /*
2596 * This routine handles present pages, when users try to write 2596 * This routine handles present pages, when users try to write
2597 * to a shared page. It is done by copying the page to a new address 2597 * to a shared page. It is done by copying the page to a new address
2598 * and decrementing the shared-page counter for the old page. 2598 * and decrementing the shared-page counter for the old page.
2599 * 2599 *
2600 * Note that this routine assumes that the protection checks have been 2600 * Note that this routine assumes that the protection checks have been
2601 * done by the caller (the low-level page fault routine in most cases). 2601 * done by the caller (the low-level page fault routine in most cases).
2602 * Thus we can safely just mark it writable once we've done any necessary 2602 * Thus we can safely just mark it writable once we've done any necessary
2603 * COW. 2603 * COW.
2604 * 2604 *
2605 * We also mark the page dirty at this point even though the page will 2605 * We also mark the page dirty at this point even though the page will
2606 * change only once the write actually happens. This avoids a few races, 2606 * change only once the write actually happens. This avoids a few races,
2607 * and potentially makes it more efficient. 2607 * and potentially makes it more efficient.
2608 * 2608 *
2609 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2609 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2610 * but allow concurrent faults), with pte both mapped and locked. 2610 * but allow concurrent faults), with pte both mapped and locked.
2611 * We return with mmap_sem still held, but pte unmapped and unlocked. 2611 * We return with mmap_sem still held, but pte unmapped and unlocked.
2612 */ 2612 */
2613 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 2613 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2614 unsigned long address, pte_t *page_table, pmd_t *pmd, 2614 unsigned long address, pte_t *page_table, pmd_t *pmd,
2615 spinlock_t *ptl, pte_t orig_pte) 2615 spinlock_t *ptl, pte_t orig_pte)
2616 __releases(ptl) 2616 __releases(ptl)
2617 { 2617 {
2618 struct page *old_page, *new_page = NULL; 2618 struct page *old_page, *new_page = NULL;
2619 pte_t entry; 2619 pte_t entry;
2620 int ret = 0; 2620 int ret = 0;
2621 int page_mkwrite = 0; 2621 int page_mkwrite = 0;
2622 struct page *dirty_page = NULL; 2622 struct page *dirty_page = NULL;
2623 unsigned long mmun_start = 0; /* For mmu_notifiers */ 2623 unsigned long mmun_start = 0; /* For mmu_notifiers */
2624 unsigned long mmun_end = 0; /* For mmu_notifiers */ 2624 unsigned long mmun_end = 0; /* For mmu_notifiers */
2625 2625
2626 old_page = vm_normal_page(vma, address, orig_pte); 2626 old_page = vm_normal_page(vma, address, orig_pte);
2627 if (!old_page) { 2627 if (!old_page) {
2628 /* 2628 /*
2629 * VM_MIXEDMAP !pfn_valid() case 2629 * VM_MIXEDMAP !pfn_valid() case
2630 * 2630 *
2631 * We should not cow pages in a shared writeable mapping. 2631 * We should not cow pages in a shared writeable mapping.
2632 * Just mark the pages writable as we can't do any dirty 2632 * Just mark the pages writable as we can't do any dirty
2633 * accounting on raw pfn maps. 2633 * accounting on raw pfn maps.
2634 */ 2634 */
2635 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2635 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2636 (VM_WRITE|VM_SHARED)) 2636 (VM_WRITE|VM_SHARED))
2637 goto reuse; 2637 goto reuse;
2638 goto gotten; 2638 goto gotten;
2639 } 2639 }
2640 2640
2641 /* 2641 /*
2642 * Take out anonymous pages first, anonymous shared vmas are 2642 * Take out anonymous pages first, anonymous shared vmas are
2643 * not dirty accountable. 2643 * not dirty accountable.
2644 */ 2644 */
2645 if (PageAnon(old_page) && !PageKsm(old_page)) { 2645 if (PageAnon(old_page) && !PageKsm(old_page)) {
2646 if (!trylock_page(old_page)) { 2646 if (!trylock_page(old_page)) {
2647 page_cache_get(old_page); 2647 page_cache_get(old_page);
2648 pte_unmap_unlock(page_table, ptl); 2648 pte_unmap_unlock(page_table, ptl);
2649 lock_page(old_page); 2649 lock_page(old_page);
2650 page_table = pte_offset_map_lock(mm, pmd, address, 2650 page_table = pte_offset_map_lock(mm, pmd, address,
2651 &ptl); 2651 &ptl);
2652 if (!pte_same(*page_table, orig_pte)) { 2652 if (!pte_same(*page_table, orig_pte)) {
2653 unlock_page(old_page); 2653 unlock_page(old_page);
2654 goto unlock; 2654 goto unlock;
2655 } 2655 }
2656 page_cache_release(old_page); 2656 page_cache_release(old_page);
2657 } 2657 }
2658 if (reuse_swap_page(old_page)) { 2658 if (reuse_swap_page(old_page)) {
2659 /* 2659 /*
2660 * The page is all ours. Move it to our anon_vma so 2660 * The page is all ours. Move it to our anon_vma so
2661 * the rmap code will not search our parent or siblings. 2661 * the rmap code will not search our parent or siblings.
2662 * Protected against the rmap code by the page lock. 2662 * Protected against the rmap code by the page lock.
2663 */ 2663 */
2664 page_move_anon_rmap(old_page, vma, address); 2664 page_move_anon_rmap(old_page, vma, address);
2665 unlock_page(old_page); 2665 unlock_page(old_page);
2666 goto reuse; 2666 goto reuse;
2667 } 2667 }
2668 unlock_page(old_page); 2668 unlock_page(old_page);
2669 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2669 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2670 (VM_WRITE|VM_SHARED))) { 2670 (VM_WRITE|VM_SHARED))) {
2671 /* 2671 /*
2672 * Only catch write-faults on shared writable pages, 2672 * Only catch write-faults on shared writable pages,
2673 * read-only shared pages can get COWed by 2673 * read-only shared pages can get COWed by
2674 * get_user_pages(.write=1, .force=1). 2674 * get_user_pages(.write=1, .force=1).
2675 */ 2675 */
2676 if (vma->vm_ops && vma->vm_ops->page_mkwrite) { 2676 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2677 struct vm_fault vmf; 2677 struct vm_fault vmf;
2678 int tmp; 2678 int tmp;
2679 2679
2680 vmf.virtual_address = (void __user *)(address & 2680 vmf.virtual_address = (void __user *)(address &
2681 PAGE_MASK); 2681 PAGE_MASK);
2682 vmf.pgoff = old_page->index; 2682 vmf.pgoff = old_page->index;
2683 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; 2683 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2684 vmf.page = old_page; 2684 vmf.page = old_page;
2685 2685
2686 /* 2686 /*
2687 * Notify the address space that the page is about to 2687 * Notify the address space that the page is about to
2688 * become writable so that it can prohibit this or wait 2688 * become writable so that it can prohibit this or wait
2689 * for the page to get into an appropriate state. 2689 * for the page to get into an appropriate state.
2690 * 2690 *
2691 * We do this without the lock held, so that it can 2691 * We do this without the lock held, so that it can
2692 * sleep if it needs to. 2692 * sleep if it needs to.
2693 */ 2693 */
2694 page_cache_get(old_page); 2694 page_cache_get(old_page);
2695 pte_unmap_unlock(page_table, ptl); 2695 pte_unmap_unlock(page_table, ptl);
2696 2696
2697 tmp = vma->vm_ops->page_mkwrite(vma, &vmf); 2697 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
2698 if (unlikely(tmp & 2698 if (unlikely(tmp &
2699 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { 2699 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
2700 ret = tmp; 2700 ret = tmp;
2701 goto unwritable_page; 2701 goto unwritable_page;
2702 } 2702 }
2703 if (unlikely(!(tmp & VM_FAULT_LOCKED))) { 2703 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
2704 lock_page(old_page); 2704 lock_page(old_page);
2705 if (!old_page->mapping) { 2705 if (!old_page->mapping) {
2706 ret = 0; /* retry the fault */ 2706 ret = 0; /* retry the fault */
2707 unlock_page(old_page); 2707 unlock_page(old_page);
2708 goto unwritable_page; 2708 goto unwritable_page;
2709 } 2709 }
2710 } else 2710 } else
2711 VM_BUG_ON(!PageLocked(old_page)); 2711 VM_BUG_ON(!PageLocked(old_page));
2712 2712
2713 /* 2713 /*
2714 * Since we dropped the lock we need to revalidate 2714 * Since we dropped the lock we need to revalidate
2715 * the PTE as someone else may have changed it. If 2715 * the PTE as someone else may have changed it. If
2716 * they did, we just return, as we can count on the 2716 * they did, we just return, as we can count on the
2717 * MMU to tell us if they didn't also make it writable. 2717 * MMU to tell us if they didn't also make it writable.
2718 */ 2718 */
2719 page_table = pte_offset_map_lock(mm, pmd, address, 2719 page_table = pte_offset_map_lock(mm, pmd, address,
2720 &ptl); 2720 &ptl);
2721 if (!pte_same(*page_table, orig_pte)) { 2721 if (!pte_same(*page_table, orig_pte)) {
2722 unlock_page(old_page); 2722 unlock_page(old_page);
2723 goto unlock; 2723 goto unlock;
2724 } 2724 }
2725 2725
2726 page_mkwrite = 1; 2726 page_mkwrite = 1;
2727 } 2727 }
2728 dirty_page = old_page; 2728 dirty_page = old_page;
2729 get_page(dirty_page); 2729 get_page(dirty_page);
2730 2730
2731 reuse: 2731 reuse:
2732 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2732 flush_cache_page(vma, address, pte_pfn(orig_pte));
2733 entry = pte_mkyoung(orig_pte); 2733 entry = pte_mkyoung(orig_pte);
2734 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2734 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2735 if (ptep_set_access_flags(vma, address, page_table, entry,1)) 2735 if (ptep_set_access_flags(vma, address, page_table, entry,1))
2736 update_mmu_cache(vma, address, page_table); 2736 update_mmu_cache(vma, address, page_table);
2737 pte_unmap_unlock(page_table, ptl); 2737 pte_unmap_unlock(page_table, ptl);
2738 ret |= VM_FAULT_WRITE; 2738 ret |= VM_FAULT_WRITE;
2739 2739
2740 if (!dirty_page) 2740 if (!dirty_page)
2741 return ret; 2741 return ret;
2742 2742
2743 /* 2743 /*
2744 * Yes, Virginia, this is actually required to prevent a race 2744 * Yes, Virginia, this is actually required to prevent a race
2745 * with clear_page_dirty_for_io() from clearing the page dirty 2745 * with clear_page_dirty_for_io() from clearing the page dirty
2746 * bit after it clears all dirty ptes, but before a racing 2746 * bit after it clears all dirty ptes, but before a racing
2747 * do_wp_page installs a dirty pte. 2747 * do_wp_page installs a dirty pte.
2748 * 2748 *
2749 * __do_fault is protected similarly. 2749 * __do_fault is protected similarly.
2750 */ 2750 */
2751 if (!page_mkwrite) { 2751 if (!page_mkwrite) {
2752 wait_on_page_locked(dirty_page); 2752 wait_on_page_locked(dirty_page);
2753 set_page_dirty_balance(dirty_page, page_mkwrite); 2753 set_page_dirty_balance(dirty_page, page_mkwrite);
2754 /* file_update_time outside page_lock */ 2754 /* file_update_time outside page_lock */
2755 if (vma->vm_file) 2755 if (vma->vm_file)
2756 file_update_time(vma->vm_file); 2756 file_update_time(vma->vm_file);
2757 } 2757 }
2758 put_page(dirty_page); 2758 put_page(dirty_page);
2759 if (page_mkwrite) { 2759 if (page_mkwrite) {
2760 struct address_space *mapping = dirty_page->mapping; 2760 struct address_space *mapping = dirty_page->mapping;
2761 2761
2762 set_page_dirty(dirty_page); 2762 set_page_dirty(dirty_page);
2763 unlock_page(dirty_page); 2763 unlock_page(dirty_page);
2764 page_cache_release(dirty_page); 2764 page_cache_release(dirty_page);
2765 if (mapping) { 2765 if (mapping) {
2766 /* 2766 /*
2767 * Some device drivers do not set page.mapping 2767 * Some device drivers do not set page.mapping
2768 * but still dirty their pages 2768 * but still dirty their pages
2769 */ 2769 */
2770 balance_dirty_pages_ratelimited(mapping); 2770 balance_dirty_pages_ratelimited(mapping);
2771 } 2771 }
2772 } 2772 }
2773 2773
2774 return ret; 2774 return ret;
2775 } 2775 }
2776 2776
2777 /* 2777 /*
2778 * Ok, we need to copy. Oh, well.. 2778 * Ok, we need to copy. Oh, well..
2779 */ 2779 */
2780 page_cache_get(old_page); 2780 page_cache_get(old_page);
2781 gotten: 2781 gotten:
2782 pte_unmap_unlock(page_table, ptl); 2782 pte_unmap_unlock(page_table, ptl);
2783 2783
2784 if (unlikely(anon_vma_prepare(vma))) 2784 if (unlikely(anon_vma_prepare(vma)))
2785 goto oom; 2785 goto oom;
2786 2786
2787 if (is_zero_pfn(pte_pfn(orig_pte))) { 2787 if (is_zero_pfn(pte_pfn(orig_pte))) {
2788 new_page = alloc_zeroed_user_highpage_movable(vma, address); 2788 new_page = alloc_zeroed_user_highpage_movable(vma, address);
2789 if (!new_page) 2789 if (!new_page)
2790 goto oom; 2790 goto oom;
2791 } else { 2791 } else {
2792 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 2792 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2793 if (!new_page) 2793 if (!new_page)
2794 goto oom; 2794 goto oom;
2795 cow_user_page(new_page, old_page, address, vma); 2795 cow_user_page(new_page, old_page, address, vma);
2796 } 2796 }
2797 __SetPageUptodate(new_page); 2797 __SetPageUptodate(new_page);
2798 2798
2799 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) 2799 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
2800 goto oom_free_new; 2800 goto oom_free_new;
2801 2801
2802 mmun_start = address & PAGE_MASK; 2802 mmun_start = address & PAGE_MASK;
2803 mmun_end = mmun_start + PAGE_SIZE; 2803 mmun_end = mmun_start + PAGE_SIZE;
2804 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2804 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2805 2805
2806 /* 2806 /*
2807 * Re-check the pte - we dropped the lock 2807 * Re-check the pte - we dropped the lock
2808 */ 2808 */
2809 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2809 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2810 if (likely(pte_same(*page_table, orig_pte))) { 2810 if (likely(pte_same(*page_table, orig_pte))) {
2811 if (old_page) { 2811 if (old_page) {
2812 if (!PageAnon(old_page)) { 2812 if (!PageAnon(old_page)) {
2813 dec_mm_counter_fast(mm, MM_FILEPAGES); 2813 dec_mm_counter_fast(mm, MM_FILEPAGES);
2814 inc_mm_counter_fast(mm, MM_ANONPAGES); 2814 inc_mm_counter_fast(mm, MM_ANONPAGES);
2815 } 2815 }
2816 } else 2816 } else
2817 inc_mm_counter_fast(mm, MM_ANONPAGES); 2817 inc_mm_counter_fast(mm, MM_ANONPAGES);
2818 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2818 flush_cache_page(vma, address, pte_pfn(orig_pte));
2819 entry = mk_pte(new_page, vma->vm_page_prot); 2819 entry = mk_pte(new_page, vma->vm_page_prot);
2820 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2820 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2821 /* 2821 /*
2822 * Clear the pte entry and flush it first, before updating the 2822 * Clear the pte entry and flush it first, before updating the
2823 * pte with the new entry. This will avoid a race condition 2823 * pte with the new entry. This will avoid a race condition
2824 * seen in the presence of one thread doing SMC and another 2824 * seen in the presence of one thread doing SMC and another
2825 * thread doing COW. 2825 * thread doing COW.
2826 */ 2826 */
2827 ptep_clear_flush(vma, address, page_table); 2827 ptep_clear_flush(vma, address, page_table);
2828 page_add_new_anon_rmap(new_page, vma, address); 2828 page_add_new_anon_rmap(new_page, vma, address);
2829 /* 2829 /*
2830 * We call the notify macro here because, when using secondary 2830 * We call the notify macro here because, when using secondary
2831 * mmu page tables (such as kvm shadow page tables), we want the 2831 * mmu page tables (such as kvm shadow page tables), we want the
2832 * new page to be mapped directly into the secondary page table. 2832 * new page to be mapped directly into the secondary page table.
2833 */ 2833 */
2834 set_pte_at_notify(mm, address, page_table, entry); 2834 set_pte_at_notify(mm, address, page_table, entry);
2835 update_mmu_cache(vma, address, page_table); 2835 update_mmu_cache(vma, address, page_table);
2836 if (old_page) { 2836 if (old_page) {
2837 /* 2837 /*
2838 * Only after switching the pte to the new page may 2838 * Only after switching the pte to the new page may
2839 * we remove the mapcount here. Otherwise another 2839 * we remove the mapcount here. Otherwise another
2840 * process may come and find the rmap count decremented 2840 * process may come and find the rmap count decremented
2841 * before the pte is switched to the new page, and 2841 * before the pte is switched to the new page, and
2842 * "reuse" the old page writing into it while our pte 2842 * "reuse" the old page writing into it while our pte
2843 * here still points into it and can be read by other 2843 * here still points into it and can be read by other
2844 * threads. 2844 * threads.
2845 * 2845 *
2846 * The critical issue is to order this 2846 * The critical issue is to order this
2847 * page_remove_rmap with the ptep_clear_flush above. 2847 * page_remove_rmap with the ptep_clear_flush above.
2848 * Those stores are ordered by (if nothing else,) 2848 * Those stores are ordered by (if nothing else,)
2849 * the barrier present in the atomic_add_negative 2849 * the barrier present in the atomic_add_negative
2850 * in page_remove_rmap. 2850 * in page_remove_rmap.
2851 * 2851 *
2852 * Then the TLB flush in ptep_clear_flush ensures that 2852 * Then the TLB flush in ptep_clear_flush ensures that
2853 * no process can access the old page before the 2853 * no process can access the old page before the
2854 * decremented mapcount is visible. And the old page 2854 * decremented mapcount is visible. And the old page
2855 * cannot be reused until after the decremented 2855 * cannot be reused until after the decremented
2856 * mapcount is visible. So transitively, TLBs to 2856 * mapcount is visible. So transitively, TLBs to
2857 * old page will be flushed before it can be reused. 2857 * old page will be flushed before it can be reused.
2858 */ 2858 */
2859 page_remove_rmap(old_page); 2859 page_remove_rmap(old_page);
2860 } 2860 }
2861 2861
2862 /* Free the old page.. */ 2862 /* Free the old page.. */
2863 new_page = old_page; 2863 new_page = old_page;
2864 ret |= VM_FAULT_WRITE; 2864 ret |= VM_FAULT_WRITE;
2865 } else 2865 } else
2866 mem_cgroup_uncharge_page(new_page); 2866 mem_cgroup_uncharge_page(new_page);
2867 2867
2868 if (new_page) 2868 if (new_page)
2869 page_cache_release(new_page); 2869 page_cache_release(new_page);
2870 unlock: 2870 unlock:
2871 pte_unmap_unlock(page_table, ptl); 2871 pte_unmap_unlock(page_table, ptl);
2872 if (mmun_end > mmun_start) 2872 if (mmun_end > mmun_start)
2873 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2873 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2874 if (old_page) { 2874 if (old_page) {
2875 /* 2875 /*
2876 * Don't let another task, with possibly unlocked vma, 2876 * Don't let another task, with possibly unlocked vma,
2877 * keep the mlocked page. 2877 * keep the mlocked page.
2878 */ 2878 */
2879 if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { 2879 if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
2880 lock_page(old_page); /* LRU manipulation */ 2880 lock_page(old_page); /* LRU manipulation */
2881 munlock_vma_page(old_page); 2881 munlock_vma_page(old_page);
2882 unlock_page(old_page); 2882 unlock_page(old_page);
2883 } 2883 }
2884 page_cache_release(old_page); 2884 page_cache_release(old_page);
2885 } 2885 }
2886 return ret; 2886 return ret;
2887 oom_free_new: 2887 oom_free_new:
2888 page_cache_release(new_page); 2888 page_cache_release(new_page);
2889 oom: 2889 oom:
2890 if (old_page) 2890 if (old_page)
2891 page_cache_release(old_page); 2891 page_cache_release(old_page);
2892 return VM_FAULT_OOM; 2892 return VM_FAULT_OOM;
2893 2893
2894 unwritable_page: 2894 unwritable_page:
2895 page_cache_release(old_page); 2895 page_cache_release(old_page);
2896 return ret; 2896 return ret;
2897 } 2897 }
2898 2898
2899 static void unmap_mapping_range_vma(struct vm_area_struct *vma, 2899 static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2900 unsigned long start_addr, unsigned long end_addr, 2900 unsigned long start_addr, unsigned long end_addr,
2901 struct zap_details *details) 2901 struct zap_details *details)
2902 { 2902 {
2903 zap_page_range_single(vma, start_addr, end_addr - start_addr, details); 2903 zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
2904 } 2904 }
2905 2905
2906 static inline void unmap_mapping_range_tree(struct rb_root *root, 2906 static inline void unmap_mapping_range_tree(struct rb_root *root,
2907 struct zap_details *details) 2907 struct zap_details *details)
2908 { 2908 {
2909 struct vm_area_struct *vma; 2909 struct vm_area_struct *vma;
2910 pgoff_t vba, vea, zba, zea; 2910 pgoff_t vba, vea, zba, zea;
2911 2911
2912 vma_interval_tree_foreach(vma, root, 2912 vma_interval_tree_foreach(vma, root,
2913 details->first_index, details->last_index) { 2913 details->first_index, details->last_index) {
2914 2914
2915 vba = vma->vm_pgoff; 2915 vba = vma->vm_pgoff;
2916 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; 2916 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
2917 /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ 2917 /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
2918 zba = details->first_index; 2918 zba = details->first_index;
2919 if (zba < vba) 2919 if (zba < vba)
2920 zba = vba; 2920 zba = vba;
2921 zea = details->last_index; 2921 zea = details->last_index;
2922 if (zea > vea) 2922 if (zea > vea)
2923 zea = vea; 2923 zea = vea;
2924 2924
2925 unmap_mapping_range_vma(vma, 2925 unmap_mapping_range_vma(vma,
2926 ((zba - vba) << PAGE_SHIFT) + vma->vm_start, 2926 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2927 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, 2927 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2928 details); 2928 details);
2929 } 2929 }
2930 } 2930 }
2931 2931
2932 static inline void unmap_mapping_range_list(struct list_head *head, 2932 static inline void unmap_mapping_range_list(struct list_head *head,
2933 struct zap_details *details) 2933 struct zap_details *details)
2934 { 2934 {
2935 struct vm_area_struct *vma; 2935 struct vm_area_struct *vma;
2936 2936
2937 /* 2937 /*
2938 * In nonlinear VMAs there is no correspondence between virtual address 2938 * In nonlinear VMAs there is no correspondence between virtual address
2939 * offset and file offset. So we must perform an exhaustive search 2939 * offset and file offset. So we must perform an exhaustive search
2940 * across *all* the pages in each nonlinear VMA, not just the pages 2940 * across *all* the pages in each nonlinear VMA, not just the pages
2941 * whose virtual address lies outside the file truncation point. 2941 * whose virtual address lies outside the file truncation point.
2942 */ 2942 */
2943 list_for_each_entry(vma, head, shared.nonlinear) { 2943 list_for_each_entry(vma, head, shared.nonlinear) {
2944 details->nonlinear_vma = vma; 2944 details->nonlinear_vma = vma;
2945 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); 2945 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
2946 } 2946 }
2947 } 2947 }
2948 2948
2949 /** 2949 /**
2950 * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file. 2950 * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
2951 * @mapping: the address space containing mmaps to be unmapped. 2951 * @mapping: the address space containing mmaps to be unmapped.
2952 * @holebegin: byte in first page to unmap, relative to the start of 2952 * @holebegin: byte in first page to unmap, relative to the start of
2953 * the underlying file. This will be rounded down to a PAGE_SIZE 2953 * the underlying file. This will be rounded down to a PAGE_SIZE
2954 * boundary. Note that this is different from truncate_pagecache(), which 2954 * boundary. Note that this is different from truncate_pagecache(), which
2955 * must keep the partial page. In contrast, we must get rid of 2955 * must keep the partial page. In contrast, we must get rid of
2956 * partial pages. 2956 * partial pages.
2957 * @holelen: size of prospective hole in bytes. This will be rounded 2957 * @holelen: size of prospective hole in bytes. This will be rounded
2958 * up to a PAGE_SIZE boundary. A holelen of zero truncates to the 2958 * up to a PAGE_SIZE boundary. A holelen of zero truncates to the
2959 * end of the file. 2959 * end of the file.
2960 * @even_cows: 1 when truncating a file, unmap even private COWed pages; 2960 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
2961 * but 0 when invalidating pagecache, don't throw away private data. 2961 * but 0 when invalidating pagecache, don't throw away private data.
2962 */ 2962 */
2963 void unmap_mapping_range(struct address_space *mapping, 2963 void unmap_mapping_range(struct address_space *mapping,
2964 loff_t const holebegin, loff_t const holelen, int even_cows) 2964 loff_t const holebegin, loff_t const holelen, int even_cows)
2965 { 2965 {
2966 struct zap_details details; 2966 struct zap_details details;
2967 pgoff_t hba = holebegin >> PAGE_SHIFT; 2967 pgoff_t hba = holebegin >> PAGE_SHIFT;
2968 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; 2968 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2969 2969
2970 /* Check for overflow. */ 2970 /* Check for overflow. */
2971 if (sizeof(holelen) > sizeof(hlen)) { 2971 if (sizeof(holelen) > sizeof(hlen)) {
2972 long long holeend = 2972 long long holeend =
2973 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; 2973 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2974 if (holeend & ~(long long)ULONG_MAX) 2974 if (holeend & ~(long long)ULONG_MAX)
2975 hlen = ULONG_MAX - hba + 1; 2975 hlen = ULONG_MAX - hba + 1;
2976 } 2976 }
2977 2977
2978 details.check_mapping = even_cows? NULL: mapping; 2978 details.check_mapping = even_cows? NULL: mapping;
2979 details.nonlinear_vma = NULL; 2979 details.nonlinear_vma = NULL;
2980 details.first_index = hba; 2980 details.first_index = hba;
2981 details.last_index = hba + hlen - 1; 2981 details.last_index = hba + hlen - 1;
2982 if (details.last_index < details.first_index) 2982 if (details.last_index < details.first_index)
2983 details.last_index = ULONG_MAX; 2983 details.last_index = ULONG_MAX;
2984 2984
2985 2985
2986 mutex_lock(&mapping->i_mmap_mutex); 2986 mutex_lock(&mapping->i_mmap_mutex);
2987 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) 2987 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
2988 unmap_mapping_range_tree(&mapping->i_mmap, &details); 2988 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2989 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) 2989 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2990 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); 2990 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2991 mutex_unlock(&mapping->i_mmap_mutex); 2991 mutex_unlock(&mapping->i_mmap_mutex);
2992 } 2992 }
2993 EXPORT_SYMBOL(unmap_mapping_range); 2993 EXPORT_SYMBOL(unmap_mapping_range);
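
As a hedged illustration (not part of this commit), here is how a caller such as a filesystem's truncation path might use the unmap_mapping_range() helper exported above; example_truncate_mappings() is an invented name, and per the kernel-doc above a holelen of 0 means "through the end of the file" while even_cows == 1 also drops private COWed copies:

	static void example_truncate_mappings(struct inode *inode, loff_t newsize)
	{
		struct address_space *mapping = inode->i_mapping;
		/* First whole page beyond the new size. */
		loff_t holebegin = round_up(newsize, PAGE_SIZE);

		/*
		 * Zap every pte mapping the truncated range in all mms that
		 * mmap this file; even_cows = 1 because truncation must also
		 * discard private COWed copies of those pages.
		 */
		unmap_mapping_range(mapping, holebegin, 0, 1);
	}
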
2994 2994
2995 /* 2995 /*
2996 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2996 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2997 * but allow concurrent faults), and pte mapped but not yet locked. 2997 * but allow concurrent faults), and pte mapped but not yet locked.
2998 * We return with mmap_sem still held, but pte unmapped and unlocked. 2998 * We return with mmap_sem still held, but pte unmapped and unlocked.
2999 */ 2999 */
3000 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, 3000 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
3001 unsigned long address, pte_t *page_table, pmd_t *pmd, 3001 unsigned long address, pte_t *page_table, pmd_t *pmd,
3002 unsigned int flags, pte_t orig_pte) 3002 unsigned int flags, pte_t orig_pte)
3003 { 3003 {
3004 spinlock_t *ptl; 3004 spinlock_t *ptl;
3005 struct page *page, *swapcache; 3005 struct page *page, *swapcache;
3006 swp_entry_t entry; 3006 swp_entry_t entry;
3007 pte_t pte; 3007 pte_t pte;
3008 int locked; 3008 int locked;
3009 struct mem_cgroup *ptr; 3009 struct mem_cgroup *ptr;
3010 int exclusive = 0; 3010 int exclusive = 0;
3011 int ret = 0; 3011 int ret = 0;
3012 3012
3013 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 3013 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
3014 goto out; 3014 goto out;
3015 3015
3016 entry = pte_to_swp_entry(orig_pte); 3016 entry = pte_to_swp_entry(orig_pte);
3017 if (unlikely(non_swap_entry(entry))) { 3017 if (unlikely(non_swap_entry(entry))) {
3018 if (is_migration_entry(entry)) { 3018 if (is_migration_entry(entry)) {
3019 migration_entry_wait(mm, pmd, address); 3019 migration_entry_wait(mm, pmd, address);
3020 } else if (is_hwpoison_entry(entry)) { 3020 } else if (is_hwpoison_entry(entry)) {
3021 ret = VM_FAULT_HWPOISON; 3021 ret = VM_FAULT_HWPOISON;
3022 } else { 3022 } else {
3023 print_bad_pte(vma, address, orig_pte, NULL); 3023 print_bad_pte(vma, address, orig_pte, NULL);
3024 ret = VM_FAULT_SIGBUS; 3024 ret = VM_FAULT_SIGBUS;
3025 } 3025 }
3026 goto out; 3026 goto out;
3027 } 3027 }
3028 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 3028 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
3029 page = lookup_swap_cache(entry); 3029 page = lookup_swap_cache(entry);
3030 if (!page) { 3030 if (!page) {
3031 page = swapin_readahead(entry, 3031 page = swapin_readahead(entry,
3032 GFP_HIGHUSER_MOVABLE, vma, address); 3032 GFP_HIGHUSER_MOVABLE, vma, address);
3033 if (!page) { 3033 if (!page) {
3034 /* 3034 /*
3035 * Back out if somebody else faulted in this pte 3035 * Back out if somebody else faulted in this pte
3036 * while we released the pte lock. 3036 * while we released the pte lock.
3037 */ 3037 */
3038 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 3038 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3039 if (likely(pte_same(*page_table, orig_pte))) 3039 if (likely(pte_same(*page_table, orig_pte)))
3040 ret = VM_FAULT_OOM; 3040 ret = VM_FAULT_OOM;
3041 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 3041 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3042 goto unlock; 3042 goto unlock;
3043 } 3043 }
3044 3044
3045 /* Had to read the page from swap area: Major fault */ 3045 /* Had to read the page from swap area: Major fault */
3046 ret = VM_FAULT_MAJOR; 3046 ret = VM_FAULT_MAJOR;
3047 count_vm_event(PGMAJFAULT); 3047 count_vm_event(PGMAJFAULT);
3048 mem_cgroup_count_vm_event(mm, PGMAJFAULT); 3048 mem_cgroup_count_vm_event(mm, PGMAJFAULT);
3049 } else if (PageHWPoison(page)) { 3049 } else if (PageHWPoison(page)) {
3050 /* 3050 /*
3051 * hwpoisoned dirty swapcache pages are kept for killing 3051 * hwpoisoned dirty swapcache pages are kept for killing
3052 * owner processes (which may be unknown at hwpoison time) 3052 * owner processes (which may be unknown at hwpoison time)
3053 */ 3053 */
3054 ret = VM_FAULT_HWPOISON; 3054 ret = VM_FAULT_HWPOISON;
3055 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 3055 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3056 swapcache = page; 3056 swapcache = page;
3057 goto out_release; 3057 goto out_release;
3058 } 3058 }
3059 3059
3060 swapcache = page; 3060 swapcache = page;
3061 locked = lock_page_or_retry(page, mm, flags); 3061 locked = lock_page_or_retry(page, mm, flags);
3062 3062
3063 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 3063 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3064 if (!locked) { 3064 if (!locked) {
3065 ret |= VM_FAULT_RETRY; 3065 ret |= VM_FAULT_RETRY;
3066 goto out_release; 3066 goto out_release;
3067 } 3067 }
3068 3068
3069 /* 3069 /*
3070 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not 3070 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
3071 * release the swapcache from under us. The page pin, and pte_same 3071 * release the swapcache from under us. The page pin, and pte_same
3072 * test below, are not enough to exclude that. Even if it is still 3072 * test below, are not enough to exclude that. Even if it is still
3073 * swapcache, we need to check that the page's swap has not changed. 3073 * swapcache, we need to check that the page's swap has not changed.
3074 */ 3074 */
3075 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) 3075 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
3076 goto out_page; 3076 goto out_page;
3077 3077
3078 page = ksm_might_need_to_copy(page, vma, address); 3078 page = ksm_might_need_to_copy(page, vma, address);
3079 if (unlikely(!page)) { 3079 if (unlikely(!page)) {
3080 ret = VM_FAULT_OOM; 3080 ret = VM_FAULT_OOM;
3081 page = swapcache; 3081 page = swapcache;
3082 goto out_page; 3082 goto out_page;
3083 } 3083 }
3084 3084
3085 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { 3085 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
3086 ret = VM_FAULT_OOM; 3086 ret = VM_FAULT_OOM;
3087 goto out_page; 3087 goto out_page;
3088 } 3088 }
3089 3089
3090 /* 3090 /*
3091 * Back out if somebody else already faulted in this pte. 3091 * Back out if somebody else already faulted in this pte.
3092 */ 3092 */
3093 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 3093 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3094 if (unlikely(!pte_same(*page_table, orig_pte))) 3094 if (unlikely(!pte_same(*page_table, orig_pte)))
3095 goto out_nomap; 3095 goto out_nomap;
3096 3096
3097 if (unlikely(!PageUptodate(page))) { 3097 if (unlikely(!PageUptodate(page))) {
3098 ret = VM_FAULT_SIGBUS; 3098 ret = VM_FAULT_SIGBUS;
3099 goto out_nomap; 3099 goto out_nomap;
3100 } 3100 }
3101 3101
3102 /* 3102 /*
3103 * The page isn't present yet, go ahead with the fault. 3103 * The page isn't present yet, go ahead with the fault.
3104 * 3104 *
3105 * Be careful about the sequence of operations here. 3105 * Be careful about the sequence of operations here.
3106 * To get its accounting right, reuse_swap_page() must be called 3106 * To get its accounting right, reuse_swap_page() must be called
3107 * while the page is counted on swap but not yet in mapcount i.e. 3107 * while the page is counted on swap but not yet in mapcount i.e.
3108 * before page_add_anon_rmap() and swap_free(); try_to_free_swap() 3108 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
3109 * must be called after the swap_free(), or it will never succeed. 3109 * must be called after the swap_free(), or it will never succeed.
3110 * Because delete_from_swap_cache() may be called by reuse_swap_page(), 3110 * Because delete_from_swap_cache() may be called by reuse_swap_page(),
3111 * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry 3111 * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry
3112 * in page->private. In this case, a record in swap_cgroup is silently 3112 * in page->private. In this case, a record in swap_cgroup is silently
3113 * discarded at swap_free(). 3113 * discarded at swap_free().
3114 */ 3114 */
3115 3115
3116 inc_mm_counter_fast(mm, MM_ANONPAGES); 3116 inc_mm_counter_fast(mm, MM_ANONPAGES);
3117 dec_mm_counter_fast(mm, MM_SWAPENTS); 3117 dec_mm_counter_fast(mm, MM_SWAPENTS);
3118 pte = mk_pte(page, vma->vm_page_prot); 3118 pte = mk_pte(page, vma->vm_page_prot);
3119 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { 3119 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
3120 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 3120 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
3121 flags &= ~FAULT_FLAG_WRITE; 3121 flags &= ~FAULT_FLAG_WRITE;
3122 ret |= VM_FAULT_WRITE; 3122 ret |= VM_FAULT_WRITE;
3123 exclusive = 1; 3123 exclusive = 1;
3124 } 3124 }
3125 flush_icache_page(vma, page); 3125 flush_icache_page(vma, page);
3126 set_pte_at(mm, address, page_table, pte); 3126 set_pte_at(mm, address, page_table, pte);
3127 if (page == swapcache) 3127 if (page == swapcache)
3128 do_page_add_anon_rmap(page, vma, address, exclusive); 3128 do_page_add_anon_rmap(page, vma, address, exclusive);
3129 else /* ksm created a completely new copy */ 3129 else /* ksm created a completely new copy */
3130 page_add_new_anon_rmap(page, vma, address); 3130 page_add_new_anon_rmap(page, vma, address);
3131 /* It's better to call commit-charge after rmap is established */ 3131 /* It's better to call commit-charge after rmap is established */
3132 mem_cgroup_commit_charge_swapin(page, ptr); 3132 mem_cgroup_commit_charge_swapin(page, ptr);
3133 3133
3134 swap_free(entry); 3134 swap_free(entry);
3135 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) 3135 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
3136 try_to_free_swap(page); 3136 try_to_free_swap(page);
3137 unlock_page(page); 3137 unlock_page(page);
3138 if (page != swapcache) { 3138 if (page != swapcache) {
3139 /* 3139 /*
3140 * Hold the lock to avoid the swap entry being reused 3140 * Hold the lock to avoid the swap entry being reused
3141 * until we take the PT lock for the pte_same() check 3141 * until we take the PT lock for the pte_same() check
3142 * (to avoid false positives from pte_same). For 3142 * (to avoid false positives from pte_same). For
3143 * further safety release the lock after the swap_free 3143 * further safety release the lock after the swap_free
3144 * so that the swap count won't change under a 3144 * so that the swap count won't change under a
3145 * parallel locked swapcache. 3145 * parallel locked swapcache.
3146 */ 3146 */
3147 unlock_page(swapcache); 3147 unlock_page(swapcache);
3148 page_cache_release(swapcache); 3148 page_cache_release(swapcache);
3149 } 3149 }
3150 3150
3151 if (flags & FAULT_FLAG_WRITE) { 3151 if (flags & FAULT_FLAG_WRITE) {
3152 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); 3152 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
3153 if (ret & VM_FAULT_ERROR) 3153 if (ret & VM_FAULT_ERROR)
3154 ret &= VM_FAULT_ERROR; 3154 ret &= VM_FAULT_ERROR;
3155 goto out; 3155 goto out;
3156 } 3156 }
3157 3157
3158 /* No need to invalidate - it was non-present before */ 3158 /* No need to invalidate - it was non-present before */
3159 update_mmu_cache(vma, address, page_table); 3159 update_mmu_cache(vma, address, page_table);
3160 unlock: 3160 unlock:
3161 pte_unmap_unlock(page_table, ptl); 3161 pte_unmap_unlock(page_table, ptl);
3162 out: 3162 out:
3163 return ret; 3163 return ret;
3164 out_nomap: 3164 out_nomap:
3165 mem_cgroup_cancel_charge_swapin(ptr); 3165 mem_cgroup_cancel_charge_swapin(ptr);
3166 pte_unmap_unlock(page_table, ptl); 3166 pte_unmap_unlock(page_table, ptl);
3167 out_page: 3167 out_page:
3168 unlock_page(page); 3168 unlock_page(page);
3169 out_release: 3169 out_release:
3170 page_cache_release(page); 3170 page_cache_release(page);
3171 if (page != swapcache) { 3171 if (page != swapcache) {
3172 unlock_page(swapcache); 3172 unlock_page(swapcache);
3173 page_cache_release(swapcache); 3173 page_cache_release(swapcache);
3174 } 3174 }
3175 return ret; 3175 return ret;
3176 } 3176 }
3177 3177
3178 /* 3178 /*
3179 * This is like a special single-page "expand_{down|up}wards()", 3179 * This is like a special single-page "expand_{down|up}wards()",
3180 * except we must first make sure that 'address{-|+}PAGE_SIZE' 3180 * except we must first make sure that 'address{-|+}PAGE_SIZE'
3181 * doesn't hit another vma. 3181 * doesn't hit another vma.
3182 */ 3182 */
3183 static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address) 3183 static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
3184 { 3184 {
3185 address &= PAGE_MASK; 3185 address &= PAGE_MASK;
3186 if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) { 3186 if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
3187 struct vm_area_struct *prev = vma->vm_prev; 3187 struct vm_area_struct *prev = vma->vm_prev;
3188 3188
3189 /* 3189 /*
3190 * Is there a mapping abutting this one below? 3190 * Is there a mapping abutting this one below?
3191 * 3191 *
3192 * That's only ok if it's the same stack mapping 3192 * That's only ok if it's the same stack mapping
3193 * that has gotten split.. 3193 * that has gotten split..
3194 */ 3194 */
3195 if (prev && prev->vm_end == address) 3195 if (prev && prev->vm_end == address)
3196 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; 3196 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
3197 3197
3198 expand_downwards(vma, address - PAGE_SIZE); 3198 expand_downwards(vma, address - PAGE_SIZE);
3199 } 3199 }
3200 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { 3200 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
3201 struct vm_area_struct *next = vma->vm_next; 3201 struct vm_area_struct *next = vma->vm_next;
3202 3202
3203 /* As VM_GROWSDOWN but s/below/above/ */ 3203 /* As VM_GROWSDOWN but s/below/above/ */
3204 if (next && next->vm_start == address + PAGE_SIZE) 3204 if (next && next->vm_start == address + PAGE_SIZE)
3205 return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM; 3205 return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
3206 3206
3207 expand_upwards(vma, address + PAGE_SIZE); 3207 expand_upwards(vma, address + PAGE_SIZE);
3208 } 3208 }
3209 return 0; 3209 return 0;
3210 } 3210 }
3211 3211
3212 /* 3212 /*
3213 * We enter with non-exclusive mmap_sem (to exclude vma changes, 3213 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3214 * but allow concurrent faults), and pte mapped but not yet locked. 3214 * but allow concurrent faults), and pte mapped but not yet locked.
3215 * We return with mmap_sem still held, but pte unmapped and unlocked. 3215 * We return with mmap_sem still held, but pte unmapped and unlocked.
3216 */ 3216 */
3217 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 3217 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
3218 unsigned long address, pte_t *page_table, pmd_t *pmd, 3218 unsigned long address, pte_t *page_table, pmd_t *pmd,
3219 unsigned int flags) 3219 unsigned int flags)
3220 { 3220 {
3221 struct page *page; 3221 struct page *page;
3222 spinlock_t *ptl; 3222 spinlock_t *ptl;
3223 pte_t entry; 3223 pte_t entry;
3224 3224
3225 pte_unmap(page_table); 3225 pte_unmap(page_table);
3226 3226
3227 /* Check if we need to add a guard page to the stack */ 3227 /* Check if we need to add a guard page to the stack */
3228 if (check_stack_guard_page(vma, address) < 0) 3228 if (check_stack_guard_page(vma, address) < 0)
3229 return VM_FAULT_SIGBUS; 3229 return VM_FAULT_SIGBUS;
3230 3230
3231 /* Use the zero-page for reads */ 3231 /* Use the zero-page for reads */
3232 if (!(flags & FAULT_FLAG_WRITE)) { 3232 if (!(flags & FAULT_FLAG_WRITE)) {
3233 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), 3233 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
3234 vma->vm_page_prot)); 3234 vma->vm_page_prot));
3235 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 3235 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3236 if (!pte_none(*page_table)) 3236 if (!pte_none(*page_table))
3237 goto unlock; 3237 goto unlock;
3238 goto setpte; 3238 goto setpte;
3239 } 3239 }
3240 3240
3241 /* Allocate our own private page. */ 3241 /* Allocate our own private page. */
3242 if (unlikely(anon_vma_prepare(vma))) 3242 if (unlikely(anon_vma_prepare(vma)))
3243 goto oom; 3243 goto oom;
3244 page = alloc_zeroed_user_highpage_movable(vma, address); 3244 page = alloc_zeroed_user_highpage_movable(vma, address);
3245 if (!page) 3245 if (!page)
3246 goto oom; 3246 goto oom;
3247 /*
3248 * The memory barrier inside __SetPageUptodate makes sure that
3249 * preceding stores to the page contents become visible before
3250 * the set_pte_at() write.
3251 */
3247 __SetPageUptodate(page); 3252 __SetPageUptodate(page);
3248 3253
3249 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) 3254 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
3250 goto oom_free_page; 3255 goto oom_free_page;
3251 3256
3252 entry = mk_pte(page, vma->vm_page_prot); 3257 entry = mk_pte(page, vma->vm_page_prot);
3253 if (vma->vm_flags & VM_WRITE) 3258 if (vma->vm_flags & VM_WRITE)
3254 entry = pte_mkwrite(pte_mkdirty(entry)); 3259 entry = pte_mkwrite(pte_mkdirty(entry));
3255 3260
3256 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 3261 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3257 if (!pte_none(*page_table)) 3262 if (!pte_none(*page_table))
3258 goto release; 3263 goto release;
3259 3264
3260 inc_mm_counter_fast(mm, MM_ANONPAGES); 3265 inc_mm_counter_fast(mm, MM_ANONPAGES);
3261 page_add_new_anon_rmap(page, vma, address); 3266 page_add_new_anon_rmap(page, vma, address);
3262 setpte: 3267 setpte:
3263 set_pte_at(mm, address, page_table, entry); 3268 set_pte_at(mm, address, page_table, entry);
3264 3269
3265 /* No need to invalidate - it was non-present before */ 3270 /* No need to invalidate - it was non-present before */
3266 update_mmu_cache(vma, address, page_table); 3271 update_mmu_cache(vma, address, page_table);
3267 unlock: 3272 unlock:
3268 pte_unmap_unlock(page_table, ptl); 3273 pte_unmap_unlock(page_table, ptl);
3269 return 0; 3274 return 0;
3270 release: 3275 release:
3271 mem_cgroup_uncharge_page(page); 3276 mem_cgroup_uncharge_page(page);
3272 page_cache_release(page); 3277 page_cache_release(page);
3273 goto unlock; 3278 goto unlock;
3274 oom_free_page: 3279 oom_free_page:
3275 page_cache_release(page); 3280 page_cache_release(page);
3276 oom: 3281 oom:
3277 return VM_FAULT_OOM; 3282 return VM_FAULT_OOM;
3278 } 3283 }
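
The comment added to do_anonymous_page() above leans on the smp_wmb() inside __SetPageUptodate(). As a hedged illustration (not code from this commit), the ordering it provides once set_pte_at() publishes the mapping looks roughly like this:

	/*
	 * CPU 0 (faulting task)                CPU 1 (another thread of the mm)
	 * ---------------------                --------------------------------
	 * ...zero/initialize page contents...
	 * __SetPageUptodate(page);
	 *   -> smp_wmb(), then PG_uptodate
	 * set_pte_at(mm, addr, pte, entry);    walks the page table, sees the
	 *                                      new pte, and loads from the page;
	 *                                      the wmb keeps it from observing
	 *                                      the pte without also observing
	 *                                      the initialized contents.
	 */

Without that barrier, the store installing the pte could become visible before the stores that initialized the page, and another thread could read stale data through the freshly mapped address.
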
3279 3284
3280 /* 3285 /*
3281 * __do_fault() tries to create a new page mapping. It aggressively 3286 * __do_fault() tries to create a new page mapping. It aggressively
3282 * tries to share with existing pages, but makes a separate copy if 3287 * tries to share with existing pages, but makes a separate copy if
3283 * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid 3288 * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
3284 * the next page fault. 3289 * the next page fault.
3285 * 3290 *
3286 * As this is called only for pages that do not currently exist, we 3291 * As this is called only for pages that do not currently exist, we
3287 * do not need to flush old virtual caches or the TLB. 3292 * do not need to flush old virtual caches or the TLB.
3288 * 3293 *
3289 * We enter with non-exclusive mmap_sem (to exclude vma changes, 3294 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3290 * but allow concurrent faults), and pte neither mapped nor locked. 3295 * but allow concurrent faults), and pte neither mapped nor locked.
3291 * We return with mmap_sem still held, but pte unmapped and unlocked. 3296 * We return with mmap_sem still held, but pte unmapped and unlocked.
3292 */ 3297 */
3293 static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3298 static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3294 unsigned long address, pmd_t *pmd, 3299 unsigned long address, pmd_t *pmd,
3295 pgoff_t pgoff, unsigned int flags, pte_t orig_pte) 3300 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
3296 { 3301 {
3297 pte_t *page_table; 3302 pte_t *page_table;
3298 spinlock_t *ptl; 3303 spinlock_t *ptl;
3299 struct page *page; 3304 struct page *page;
3300 struct page *cow_page; 3305 struct page *cow_page;
3301 pte_t entry; 3306 pte_t entry;
3302 int anon = 0; 3307 int anon = 0;
3303 struct page *dirty_page = NULL; 3308 struct page *dirty_page = NULL;
3304 struct vm_fault vmf; 3309 struct vm_fault vmf;
3305 int ret; 3310 int ret;
3306 int page_mkwrite = 0; 3311 int page_mkwrite = 0;
3307 3312
3308 /* 3313 /*
3309 * If we do COW later, allocate page before taking lock_page() 3314 * If we do COW later, allocate page before taking lock_page()
3310 * on the file cache page. This will reduce lock holding time. 3315 * on the file cache page. This will reduce lock holding time.
3311 */ 3316 */
3312 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { 3317 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
3313 3318
3314 if (unlikely(anon_vma_prepare(vma))) 3319 if (unlikely(anon_vma_prepare(vma)))
3315 return VM_FAULT_OOM; 3320 return VM_FAULT_OOM;
3316 3321
3317 cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 3322 cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
3318 if (!cow_page) 3323 if (!cow_page)
3319 return VM_FAULT_OOM; 3324 return VM_FAULT_OOM;
3320 3325
3321 if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) { 3326 if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
3322 page_cache_release(cow_page); 3327 page_cache_release(cow_page);
3323 return VM_FAULT_OOM; 3328 return VM_FAULT_OOM;
3324 } 3329 }
3325 } else 3330 } else
3326 cow_page = NULL; 3331 cow_page = NULL;
3327 3332
3328 vmf.virtual_address = (void __user *)(address & PAGE_MASK); 3333 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
3329 vmf.pgoff = pgoff; 3334 vmf.pgoff = pgoff;
3330 vmf.flags = flags; 3335 vmf.flags = flags;
3331 vmf.page = NULL; 3336 vmf.page = NULL;
3332 3337
3333 ret = vma->vm_ops->fault(vma, &vmf); 3338 ret = vma->vm_ops->fault(vma, &vmf);
3334 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | 3339 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3335 VM_FAULT_RETRY))) 3340 VM_FAULT_RETRY)))
3336 goto uncharge_out; 3341 goto uncharge_out;
3337 3342
3338 if (unlikely(PageHWPoison(vmf.page))) { 3343 if (unlikely(PageHWPoison(vmf.page))) {
3339 if (ret & VM_FAULT_LOCKED) 3344 if (ret & VM_FAULT_LOCKED)
3340 unlock_page(vmf.page); 3345 unlock_page(vmf.page);
3341 ret = VM_FAULT_HWPOISON; 3346 ret = VM_FAULT_HWPOISON;
3342 goto uncharge_out; 3347 goto uncharge_out;
3343 } 3348 }
3344 3349
3345 /* 3350 /*
3346 * For consistency in subsequent calls, make the faulted page always 3351 * For consistency in subsequent calls, make the faulted page always
3347 * locked. 3352 * locked.
3348 */ 3353 */
3349 if (unlikely(!(ret & VM_FAULT_LOCKED))) 3354 if (unlikely(!(ret & VM_FAULT_LOCKED)))
3350 lock_page(vmf.page); 3355 lock_page(vmf.page);
3351 else 3356 else
3352 VM_BUG_ON(!PageLocked(vmf.page)); 3357 VM_BUG_ON(!PageLocked(vmf.page));
3353 3358
3354 /* 3359 /*
3355 * Should we do an early C-O-W break? 3360 * Should we do an early C-O-W break?
3356 */ 3361 */
3357 page = vmf.page; 3362 page = vmf.page;
3358 if (flags & FAULT_FLAG_WRITE) { 3363 if (flags & FAULT_FLAG_WRITE) {
3359 if (!(vma->vm_flags & VM_SHARED)) { 3364 if (!(vma->vm_flags & VM_SHARED)) {
3360 page = cow_page; 3365 page = cow_page;
3361 anon = 1; 3366 anon = 1;
3362 copy_user_highpage(page, vmf.page, address, vma); 3367 copy_user_highpage(page, vmf.page, address, vma);
3363 __SetPageUptodate(page); 3368 __SetPageUptodate(page);
3364 } else { 3369 } else {
3365 /* 3370 /*
3366 * If the page will be shareable, see if the backing 3371 * If the page will be shareable, see if the backing
3367 * address space wants to know that the page is about 3372 * address space wants to know that the page is about
3368 * to become writable 3373 * to become writable
3369 */ 3374 */
3370 if (vma->vm_ops->page_mkwrite) { 3375 if (vma->vm_ops->page_mkwrite) {
3371 int tmp; 3376 int tmp;
3372 3377
3373 unlock_page(page); 3378 unlock_page(page);
3374 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; 3379 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
3375 tmp = vma->vm_ops->page_mkwrite(vma, &vmf); 3380 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
3376 if (unlikely(tmp & 3381 if (unlikely(tmp &
3377 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { 3382 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
3378 ret = tmp; 3383 ret = tmp;
3379 goto unwritable_page; 3384 goto unwritable_page;
3380 } 3385 }
3381 if (unlikely(!(tmp & VM_FAULT_LOCKED))) { 3386 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
3382 lock_page(page); 3387 lock_page(page);
3383 if (!page->mapping) { 3388 if (!page->mapping) {
3384 ret = 0; /* retry the fault */ 3389 ret = 0; /* retry the fault */
3385 unlock_page(page); 3390 unlock_page(page);
3386 goto unwritable_page; 3391 goto unwritable_page;
3387 } 3392 }
3388 } else 3393 } else
3389 VM_BUG_ON(!PageLocked(page)); 3394 VM_BUG_ON(!PageLocked(page));
3390 page_mkwrite = 1; 3395 page_mkwrite = 1;
3391 } 3396 }
3392 } 3397 }
3393 3398
3394 } 3399 }
3395 3400
3396 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 3401 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3397 3402
3398 /* 3403 /*
3399 * This silly early PAGE_DIRTY setting removes a race 3404 * This silly early PAGE_DIRTY setting removes a race
3400 * due to the bad i386 page protection. But it's valid 3405 * due to the bad i386 page protection. But it's valid
3401 * for other architectures too. 3406 * for other architectures too.
3402 * 3407 *
3403 * Note that if FAULT_FLAG_WRITE is set, we either now have 3408 * Note that if FAULT_FLAG_WRITE is set, we either now have
3404 * an exclusive copy of the page, or this is a shared mapping, 3409 * an exclusive copy of the page, or this is a shared mapping,
3405 * so we can make it writable and dirty to avoid having to 3410 * so we can make it writable and dirty to avoid having to
3406 * handle that later. 3411 * handle that later.
3407 */ 3412 */
3408 /* Only go through if we didn't race with anybody else... */ 3413 /* Only go through if we didn't race with anybody else... */
3409 if (likely(pte_same(*page_table, orig_pte))) { 3414 if (likely(pte_same(*page_table, orig_pte))) {
3410 flush_icache_page(vma, page); 3415 flush_icache_page(vma, page);
3411 entry = mk_pte(page, vma->vm_page_prot); 3416 entry = mk_pte(page, vma->vm_page_prot);
3412 if (flags & FAULT_FLAG_WRITE) 3417 if (flags & FAULT_FLAG_WRITE)
3413 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 3418 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3414 if (anon) { 3419 if (anon) {
3415 inc_mm_counter_fast(mm, MM_ANONPAGES); 3420 inc_mm_counter_fast(mm, MM_ANONPAGES);
3416 page_add_new_anon_rmap(page, vma, address); 3421 page_add_new_anon_rmap(page, vma, address);
3417 } else { 3422 } else {
3418 inc_mm_counter_fast(mm, MM_FILEPAGES); 3423 inc_mm_counter_fast(mm, MM_FILEPAGES);
3419 page_add_file_rmap(page); 3424 page_add_file_rmap(page);
3420 if (flags & FAULT_FLAG_WRITE) { 3425 if (flags & FAULT_FLAG_WRITE) {
3421 dirty_page = page; 3426 dirty_page = page;
3422 get_page(dirty_page); 3427 get_page(dirty_page);
3423 } 3428 }
3424 } 3429 }
3425 set_pte_at(mm, address, page_table, entry); 3430 set_pte_at(mm, address, page_table, entry);
3426 3431
3427 /* no need to invalidate: a not-present page won't be cached */ 3432 /* no need to invalidate: a not-present page won't be cached */
3428 update_mmu_cache(vma, address, page_table); 3433 update_mmu_cache(vma, address, page_table);
3429 } else { 3434 } else {
3430 if (cow_page) 3435 if (cow_page)
3431 mem_cgroup_uncharge_page(cow_page); 3436 mem_cgroup_uncharge_page(cow_page);
3432 if (anon) 3437 if (anon)
3433 page_cache_release(page); 3438 page_cache_release(page);
3434 else 3439 else
3435 anon = 1; /* no anon but release faulted_page */ 3440 anon = 1; /* no anon but release faulted_page */
3436 } 3441 }
3437 3442
3438 pte_unmap_unlock(page_table, ptl); 3443 pte_unmap_unlock(page_table, ptl);
3439 3444
3440 if (dirty_page) { 3445 if (dirty_page) {
3441 struct address_space *mapping = page->mapping; 3446 struct address_space *mapping = page->mapping;
3442 int dirtied = 0; 3447 int dirtied = 0;
3443 3448
3444 if (set_page_dirty(dirty_page)) 3449 if (set_page_dirty(dirty_page))
3445 dirtied = 1; 3450 dirtied = 1;
3446 unlock_page(dirty_page); 3451 unlock_page(dirty_page);
3447 put_page(dirty_page); 3452 put_page(dirty_page);
3448 if ((dirtied || page_mkwrite) && mapping) { 3453 if ((dirtied || page_mkwrite) && mapping) {
3449 /* 3454 /*
3450 * Some device drivers do not set page.mapping but still 3455 * Some device drivers do not set page.mapping but still
3451 * dirty their pages 3456 * dirty their pages
3452 */ 3457 */
3453 balance_dirty_pages_ratelimited(mapping); 3458 balance_dirty_pages_ratelimited(mapping);
3454 } 3459 }
3455 3460
3456 /* file_update_time outside page_lock */ 3461 /* file_update_time outside page_lock */
3457 if (vma->vm_file && !page_mkwrite) 3462 if (vma->vm_file && !page_mkwrite)
3458 file_update_time(vma->vm_file); 3463 file_update_time(vma->vm_file);
3459 } else { 3464 } else {
3460 unlock_page(vmf.page); 3465 unlock_page(vmf.page);
3461 if (anon) 3466 if (anon)
3462 page_cache_release(vmf.page); 3467 page_cache_release(vmf.page);
3463 } 3468 }
3464 3469
3465 return ret; 3470 return ret;
3466 3471
3467 unwritable_page: 3472 unwritable_page:
3468 page_cache_release(page); 3473 page_cache_release(page);
3469 return ret; 3474 return ret;
3470 uncharge_out: 3475 uncharge_out:
3471 /* fs's fault handler got an error */ 3476 /* fs's fault handler got an error */
3472 if (cow_page) { 3477 if (cow_page) {
3473 mem_cgroup_uncharge_page(cow_page); 3478 mem_cgroup_uncharge_page(cow_page);
3474 page_cache_release(cow_page); 3479 page_cache_release(cow_page);
3475 } 3480 }
3476 return ret; 3481 return ret;
3477 } 3482 }
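
__do_fault() above calls vma->vm_ops->fault() and maps whatever page the callback hands back. The sketch below is a hypothetical minimal handler (all example_* names are invented, not from this commit) showing the contract __do_fault() relies on: return a referenced page in vmf->page, and set VM_FAULT_LOCKED only if that page is returned locked:

	/* Invented driver state used only for this sketch. */
	struct example_dev {
		struct page **pages;		/* pre-allocated, initialized pages */
		unsigned long nr_pages;
	};

	static int example_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		struct example_dev *dev = vma->vm_private_data;

		if (vmf->pgoff >= dev->nr_pages)
			return VM_FAULT_SIGBUS;

		/* __do_fault() expects a referenced page in vmf->page. */
		vmf->page = dev->pages[vmf->pgoff];
		get_page(vmf->page);

		/*
		 * Returning 0 without VM_FAULT_LOCKED is fine: __do_fault()
		 * will lock_page() the page itself, as the "make the faulted
		 * page always locked" comment above shows.
		 */
		return 0;
	}
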
3478 3483
3479 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3484 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3480 unsigned long address, pte_t *page_table, pmd_t *pmd, 3485 unsigned long address, pte_t *page_table, pmd_t *pmd,
3481 unsigned int flags, pte_t orig_pte) 3486 unsigned int flags, pte_t orig_pte)
3482 { 3487 {
3483 pgoff_t pgoff = (((address & PAGE_MASK) 3488 pgoff_t pgoff = (((address & PAGE_MASK)
3484 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 3489 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
3485 3490
3486 pte_unmap(page_table); 3491 pte_unmap(page_table);
3487 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 3492 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3488 } 3493 }
3489 3494
3490 /* 3495 /*
3491 * Fault of a previously existing named mapping. Repopulate the pte 3496 * Fault of a previously existing named mapping. Repopulate the pte
3492 * from the encoded file_pte if possible. This enables swappable 3497 * from the encoded file_pte if possible. This enables swappable
3493 * nonlinear vmas. 3498 * nonlinear vmas.
3494 * 3499 *
3495 * We enter with non-exclusive mmap_sem (to exclude vma changes, 3500 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3496 * but allow concurrent faults), and pte mapped but not yet locked. 3501 * but allow concurrent faults), and pte mapped but not yet locked.
3497 * We return with mmap_sem still held, but pte unmapped and unlocked. 3502 * We return with mmap_sem still held, but pte unmapped and unlocked.
3498 */ 3503 */
3499 static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3504 static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3500 unsigned long address, pte_t *page_table, pmd_t *pmd, 3505 unsigned long address, pte_t *page_table, pmd_t *pmd,
3501 unsigned int flags, pte_t orig_pte) 3506 unsigned int flags, pte_t orig_pte)
3502 { 3507 {
3503 pgoff_t pgoff; 3508 pgoff_t pgoff;
3504 3509
3505 flags |= FAULT_FLAG_NONLINEAR; 3510 flags |= FAULT_FLAG_NONLINEAR;
3506 3511
3507 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 3512 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
3508 return 0; 3513 return 0;
3509 3514
3510 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { 3515 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
3511 /* 3516 /*
3512 * Page table corrupted: show pte and kill process. 3517 * Page table corrupted: show pte and kill process.
3513 */ 3518 */
3514 print_bad_pte(vma, address, orig_pte, NULL); 3519 print_bad_pte(vma, address, orig_pte, NULL);
3515 return VM_FAULT_SIGBUS; 3520 return VM_FAULT_SIGBUS;
3516 } 3521 }
3517 3522
3518 pgoff = pte_to_pgoff(orig_pte); 3523 pgoff = pte_to_pgoff(orig_pte);
3519 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 3524 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3520 } 3525 }
3521 3526
3522 int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, 3527 int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3523 unsigned long addr, int current_nid) 3528 unsigned long addr, int current_nid)
3524 { 3529 {
3525 get_page(page); 3530 get_page(page);
3526 3531
3527 count_vm_numa_event(NUMA_HINT_FAULTS); 3532 count_vm_numa_event(NUMA_HINT_FAULTS);
3528 if (current_nid == numa_node_id()) 3533 if (current_nid == numa_node_id())
3529 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); 3534 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
3530 3535
3531 return mpol_misplaced(page, vma, addr); 3536 return mpol_misplaced(page, vma, addr);
3532 } 3537 }
3533 3538
3534 int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 3539 int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3535 unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) 3540 unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
3536 { 3541 {
3537 struct page *page = NULL; 3542 struct page *page = NULL;
3538 spinlock_t *ptl; 3543 spinlock_t *ptl;
3539 int current_nid = -1; 3544 int current_nid = -1;
3540 int target_nid; 3545 int target_nid;
3541 bool migrated = false; 3546 bool migrated = false;
3542 3547
3543 /* 3548 /*
3544 * The "pte" at this point cannot be used safely without 3549 * The "pte" at this point cannot be used safely without
3545 * validation through pte_unmap_same(). It's of NUMA type but 3550 * validation through pte_unmap_same(). It's of NUMA type but
3546 * the pfn may be screwed if the read is non atomic. 3551 * the pfn may be screwed if the read is non atomic.
3547 * 3552 *
3548 * ptep_modify_prot_start is not called as this is clearing 3553 * ptep_modify_prot_start is not called as this is clearing
3549 * the _PAGE_NUMA bit and it is not really expected that there 3554 * the _PAGE_NUMA bit and it is not really expected that there
3550 * would be concurrent hardware modifications to the PTE. 3555 * would be concurrent hardware modifications to the PTE.
3551 */ 3556 */
3552 ptl = pte_lockptr(mm, pmd); 3557 ptl = pte_lockptr(mm, pmd);
3553 spin_lock(ptl); 3558 spin_lock(ptl);
3554 if (unlikely(!pte_same(*ptep, pte))) { 3559 if (unlikely(!pte_same(*ptep, pte))) {
3555 pte_unmap_unlock(ptep, ptl); 3560 pte_unmap_unlock(ptep, ptl);
3556 goto out; 3561 goto out;
3557 } 3562 }
3558 3563
3559 pte = pte_mknonnuma(pte); 3564 pte = pte_mknonnuma(pte);
3560 set_pte_at(mm, addr, ptep, pte); 3565 set_pte_at(mm, addr, ptep, pte);
3561 update_mmu_cache(vma, addr, ptep); 3566 update_mmu_cache(vma, addr, ptep);
3562 3567
3563 page = vm_normal_page(vma, addr, pte); 3568 page = vm_normal_page(vma, addr, pte);
3564 if (!page) { 3569 if (!page) {
3565 pte_unmap_unlock(ptep, ptl); 3570 pte_unmap_unlock(ptep, ptl);
3566 return 0; 3571 return 0;
3567 } 3572 }
3568 3573
3569 current_nid = page_to_nid(page); 3574 current_nid = page_to_nid(page);
3570 target_nid = numa_migrate_prep(page, vma, addr, current_nid); 3575 target_nid = numa_migrate_prep(page, vma, addr, current_nid);
3571 pte_unmap_unlock(ptep, ptl); 3576 pte_unmap_unlock(ptep, ptl);
3572 if (target_nid == -1) { 3577 if (target_nid == -1) {
3573 /* 3578 /*
3574 * Account for the fault against the current node if it is not 3579 * Account for the fault against the current node if it is not
3575 * being replaced regardless of where the page is located. 3580 * being replaced regardless of where the page is located.
3576 */ 3581 */
3577 current_nid = numa_node_id(); 3582 current_nid = numa_node_id();
3578 put_page(page); 3583 put_page(page);
3579 goto out; 3584 goto out;
3580 } 3585 }
3581 3586
3582 /* Migrate to the requested node */ 3587 /* Migrate to the requested node */
3583 migrated = migrate_misplaced_page(page, target_nid); 3588 migrated = migrate_misplaced_page(page, target_nid);
3584 if (migrated) 3589 if (migrated)
3585 current_nid = target_nid; 3590 current_nid = target_nid;
3586 3591
3587 out: 3592 out:
3588 if (current_nid != -1) 3593 if (current_nid != -1)
3589 task_numa_fault(current_nid, 1, migrated); 3594 task_numa_fault(current_nid, 1, migrated);
3590 return 0; 3595 return 0;
3591 } 3596 }
3592 3597
3593 /* NUMA hinting page fault entry point for regular pmds */ 3598 /* NUMA hinting page fault entry point for regular pmds */
3594 #ifdef CONFIG_NUMA_BALANCING 3599 #ifdef CONFIG_NUMA_BALANCING
3595 static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 3600 static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3596 unsigned long addr, pmd_t *pmdp) 3601 unsigned long addr, pmd_t *pmdp)
3597 { 3602 {
3598 pmd_t pmd; 3603 pmd_t pmd;
3599 pte_t *pte, *orig_pte; 3604 pte_t *pte, *orig_pte;
3600 unsigned long _addr = addr & PMD_MASK; 3605 unsigned long _addr = addr & PMD_MASK;
3601 unsigned long offset; 3606 unsigned long offset;
3602 spinlock_t *ptl; 3607 spinlock_t *ptl;
3603 bool numa = false; 3608 bool numa = false;
3604 int local_nid = numa_node_id(); 3609 int local_nid = numa_node_id();
3605 3610
3606 spin_lock(&mm->page_table_lock); 3611 spin_lock(&mm->page_table_lock);
3607 pmd = *pmdp; 3612 pmd = *pmdp;
3608 if (pmd_numa(pmd)) { 3613 if (pmd_numa(pmd)) {
3609 set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); 3614 set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
3610 numa = true; 3615 numa = true;
3611 } 3616 }
3612 spin_unlock(&mm->page_table_lock); 3617 spin_unlock(&mm->page_table_lock);
3613 3618
3614 if (!numa) 3619 if (!numa)
3615 return 0; 3620 return 0;
3616 3621
3617 /* we're in a page fault so some vma must be in the range */ 3622 /* we're in a page fault so some vma must be in the range */
3618 BUG_ON(!vma); 3623 BUG_ON(!vma);
3619 BUG_ON(vma->vm_start >= _addr + PMD_SIZE); 3624 BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
3620 offset = max(_addr, vma->vm_start) & ~PMD_MASK; 3625 offset = max(_addr, vma->vm_start) & ~PMD_MASK;
3621 VM_BUG_ON(offset >= PMD_SIZE); 3626 VM_BUG_ON(offset >= PMD_SIZE);
3622 orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl); 3627 orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
3623 pte += offset >> PAGE_SHIFT; 3628 pte += offset >> PAGE_SHIFT;
3624 for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { 3629 for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
3625 pte_t pteval = *pte; 3630 pte_t pteval = *pte;
3626 struct page *page; 3631 struct page *page;
3627 int curr_nid = local_nid; 3632 int curr_nid = local_nid;
3628 int target_nid; 3633 int target_nid;
3629 bool migrated; 3634 bool migrated;
3630 if (!pte_present(pteval)) 3635 if (!pte_present(pteval))
3631 continue; 3636 continue;
3632 if (!pte_numa(pteval)) 3637 if (!pte_numa(pteval))
3633 continue; 3638 continue;
3634 if (addr >= vma->vm_end) { 3639 if (addr >= vma->vm_end) {
3635 vma = find_vma(mm, addr); 3640 vma = find_vma(mm, addr);
3636 /* there's a pte present so there must be a vma */ 3641 /* there's a pte present so there must be a vma */
3637 BUG_ON(!vma); 3642 BUG_ON(!vma);
3638 BUG_ON(addr < vma->vm_start); 3643 BUG_ON(addr < vma->vm_start);
3639 } 3644 }
3640 if (pte_numa(pteval)) { 3645 if (pte_numa(pteval)) {
3641 pteval = pte_mknonnuma(pteval); 3646 pteval = pte_mknonnuma(pteval);
3642 set_pte_at(mm, addr, pte, pteval); 3647 set_pte_at(mm, addr, pte, pteval);
3643 } 3648 }
3644 page = vm_normal_page(vma, addr, pteval); 3649 page = vm_normal_page(vma, addr, pteval);
3645 if (unlikely(!page)) 3650 if (unlikely(!page))
3646 continue; 3651 continue;
3647 /* only check non-shared pages */ 3652 /* only check non-shared pages */
3648 if (unlikely(page_mapcount(page) != 1)) 3653 if (unlikely(page_mapcount(page) != 1))
3649 continue; 3654 continue;
3650 3655
3651 /* 3656 /*
3652 * Note that the NUMA fault is later accounted to either 3657 * Note that the NUMA fault is later accounted to either
3653 * the node that is currently running or where the page is 3658 * the node that is currently running or where the page is
3654 * migrated to. 3659 * migrated to.
3655 */ 3660 */
3656 curr_nid = local_nid; 3661 curr_nid = local_nid;
3657 target_nid = numa_migrate_prep(page, vma, addr, 3662 target_nid = numa_migrate_prep(page, vma, addr,
3658 page_to_nid(page)); 3663 page_to_nid(page));
3659 if (target_nid == -1) { 3664 if (target_nid == -1) {
3660 put_page(page); 3665 put_page(page);
3661 continue; 3666 continue;
3662 } 3667 }
3663 3668
3664 /* Migrate to the requested node */ 3669 /* Migrate to the requested node */
3665 pte_unmap_unlock(pte, ptl); 3670 pte_unmap_unlock(pte, ptl);
3666 migrated = migrate_misplaced_page(page, target_nid); 3671 migrated = migrate_misplaced_page(page, target_nid);
3667 if (migrated) 3672 if (migrated)
3668 curr_nid = target_nid; 3673 curr_nid = target_nid;
3669 task_numa_fault(curr_nid, 1, migrated); 3674 task_numa_fault(curr_nid, 1, migrated);
3670 3675
3671 pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); 3676 pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
3672 } 3677 }
3673 pte_unmap_unlock(orig_pte, ptl); 3678 pte_unmap_unlock(orig_pte, ptl);
3674 3679
3675 return 0; 3680 return 0;
3676 } 3681 }
3677 #else 3682 #else
3678 static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 3683 static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3679 unsigned long addr, pmd_t *pmdp) 3684 unsigned long addr, pmd_t *pmdp)
3680 { 3685 {
3681 BUG(); 3686 BUG();
3682 return 0; 3687 return 0;
3683 } 3688 }
3684 #endif /* CONFIG_NUMA_BALANCING */ 3689 #endif /* CONFIG_NUMA_BALANCING */
3685 3690
3686 /* 3691 /*
3687 * These routines also need to handle stuff like marking pages dirty 3692 * These routines also need to handle stuff like marking pages dirty
3688 * and/or accessed for architectures that don't do it in hardware (most 3693 * and/or accessed for architectures that don't do it in hardware (most
3689 * RISC architectures). The early dirtying is also good on the i386. 3694 * RISC architectures). The early dirtying is also good on the i386.
3690 * 3695 *
3691 * There is also a hook called "update_mmu_cache()" that architectures 3696 * There is also a hook called "update_mmu_cache()" that architectures
3692 * with external mmu caches can use to update those (ie the Sparc or 3697 * with external mmu caches can use to update those (ie the Sparc or
3693 * PowerPC hashed page tables that act as extended TLBs). 3698 * PowerPC hashed page tables that act as extended TLBs).
3694 * 3699 *
3695 * We enter with non-exclusive mmap_sem (to exclude vma changes, 3700 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3696 * but allow concurrent faults), and pte mapped but not yet locked. 3701 * but allow concurrent faults), and pte mapped but not yet locked.
3697 * We return with mmap_sem still held, but pte unmapped and unlocked. 3702 * We return with mmap_sem still held, but pte unmapped and unlocked.
3698 */ 3703 */
3699 int handle_pte_fault(struct mm_struct *mm, 3704 int handle_pte_fault(struct mm_struct *mm,
3700 struct vm_area_struct *vma, unsigned long address, 3705 struct vm_area_struct *vma, unsigned long address,
3701 pte_t *pte, pmd_t *pmd, unsigned int flags) 3706 pte_t *pte, pmd_t *pmd, unsigned int flags)
3702 { 3707 {
3703 pte_t entry; 3708 pte_t entry;
3704 spinlock_t *ptl; 3709 spinlock_t *ptl;
3705 3710
3706 entry = *pte; 3711 entry = *pte;
3707 if (!pte_present(entry)) { 3712 if (!pte_present(entry)) {
3708 if (pte_none(entry)) { 3713 if (pte_none(entry)) {
3709 if (vma->vm_ops) { 3714 if (vma->vm_ops) {
3710 if (likely(vma->vm_ops->fault)) 3715 if (likely(vma->vm_ops->fault))
3711 return do_linear_fault(mm, vma, address, 3716 return do_linear_fault(mm, vma, address,
3712 pte, pmd, flags, entry); 3717 pte, pmd, flags, entry);
3713 } 3718 }
3714 return do_anonymous_page(mm, vma, address, 3719 return do_anonymous_page(mm, vma, address,
3715 pte, pmd, flags); 3720 pte, pmd, flags);
3716 } 3721 }
3717 if (pte_file(entry)) 3722 if (pte_file(entry))
3718 return do_nonlinear_fault(mm, vma, address, 3723 return do_nonlinear_fault(mm, vma, address,
3719 pte, pmd, flags, entry); 3724 pte, pmd, flags, entry);
3720 return do_swap_page(mm, vma, address, 3725 return do_swap_page(mm, vma, address,
3721 pte, pmd, flags, entry); 3726 pte, pmd, flags, entry);
3722 } 3727 }
3723 3728
3724 if (pte_numa(entry)) 3729 if (pte_numa(entry))
3725 return do_numa_page(mm, vma, address, entry, pte, pmd); 3730 return do_numa_page(mm, vma, address, entry, pte, pmd);
3726 3731
3727 ptl = pte_lockptr(mm, pmd); 3732 ptl = pte_lockptr(mm, pmd);
3728 spin_lock(ptl); 3733 spin_lock(ptl);
3729 if (unlikely(!pte_same(*pte, entry))) 3734 if (unlikely(!pte_same(*pte, entry)))
3730 goto unlock; 3735 goto unlock;
3731 if (flags & FAULT_FLAG_WRITE) { 3736 if (flags & FAULT_FLAG_WRITE) {
3732 if (!pte_write(entry)) 3737 if (!pte_write(entry))
3733 return do_wp_page(mm, vma, address, 3738 return do_wp_page(mm, vma, address,
3734 pte, pmd, ptl, entry); 3739 pte, pmd, ptl, entry);
3735 entry = pte_mkdirty(entry); 3740 entry = pte_mkdirty(entry);
3736 } 3741 }
3737 entry = pte_mkyoung(entry); 3742 entry = pte_mkyoung(entry);
3738 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { 3743 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
3739 update_mmu_cache(vma, address, pte); 3744 update_mmu_cache(vma, address, pte);
3740 } else { 3745 } else {
3741 /* 3746 /*
3742 * This is needed only for protection faults but the arch code 3747 * This is needed only for protection faults but the arch code
3743 * is not yet telling us if this is a protection fault or not. 3748 * is not yet telling us if this is a protection fault or not.
3744 * This still avoids useless tlb flushes for .text page faults 3749 * This still avoids useless tlb flushes for .text page faults
3745 * with threads. 3750 * with threads.
3746 */ 3751 */
3747 if (flags & FAULT_FLAG_WRITE) 3752 if (flags & FAULT_FLAG_WRITE)
3748 flush_tlb_fix_spurious_fault(vma, address); 3753 flush_tlb_fix_spurious_fault(vma, address);
3749 } 3754 }
3750 unlock: 3755 unlock:
3751 pte_unmap_unlock(pte, ptl); 3756 pte_unmap_unlock(pte, ptl);
3752 return 0; 3757 return 0;
3753 } 3758 }
3754 3759
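The dispatch in handle_pte_fault above sends a not-present, pte_none fault in an anonymous VMA to do_anonymous_page, and a write to a read-only present pte to do_wp_page. As a rough userspace illustration (not part of this diff; file and variable names are arbitrary), the sketch below maps anonymous memory and touches it, then reads the minor-fault counter to confirm that first-touch faults were taken:

	/* Illustrative only: first touch of an anonymous MAP_PRIVATE mapping
	 * is handled by the do_anonymous_page path dispatched above. */
	#define _GNU_SOURCE
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <sys/resource.h>

	int main(void)
	{
		struct rusage before, after;
		size_t len = 16 * 4096;
		char *p;

		p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}

		getrusage(RUSAGE_SELF, &before);
		memset(p, 0xaa, len);		/* first touch faults each page in */
		getrusage(RUSAGE_SELF, &after);

		printf("minor faults taken: %ld\n",
		       after.ru_minflt - before.ru_minflt);
		munmap(p, len);
		return 0;
	}

Whether a read-before-write would first map the shared zero page depends on kernel version and access type; the point is only that the anonymous path above is what services the touch.
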
3755 /* 3760 /*
3756 * By the time we get here, we already hold the mm semaphore 3761 * By the time we get here, we already hold the mm semaphore
3757 */ 3762 */
3758 int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3763 int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3759 unsigned long address, unsigned int flags) 3764 unsigned long address, unsigned int flags)
3760 { 3765 {
3761 pgd_t *pgd; 3766 pgd_t *pgd;
3762 pud_t *pud; 3767 pud_t *pud;
3763 pmd_t *pmd; 3768 pmd_t *pmd;
3764 pte_t *pte; 3769 pte_t *pte;
3765 3770
3766 __set_current_state(TASK_RUNNING); 3771 __set_current_state(TASK_RUNNING);
3767 3772
3768 count_vm_event(PGFAULT); 3773 count_vm_event(PGFAULT);
3769 mem_cgroup_count_vm_event(mm, PGFAULT); 3774 mem_cgroup_count_vm_event(mm, PGFAULT);
3770 3775
3771 /* do counter updates before entering really critical section. */ 3776 /* do counter updates before entering really critical section. */
3772 check_sync_rss_stat(current); 3777 check_sync_rss_stat(current);
3773 3778
3774 if (unlikely(is_vm_hugetlb_page(vma))) 3779 if (unlikely(is_vm_hugetlb_page(vma)))
3775 return hugetlb_fault(mm, vma, address, flags); 3780 return hugetlb_fault(mm, vma, address, flags);
3776 3781
3777 retry: 3782 retry:
3778 pgd = pgd_offset(mm, address); 3783 pgd = pgd_offset(mm, address);
3779 pud = pud_alloc(mm, pgd, address); 3784 pud = pud_alloc(mm, pgd, address);
3780 if (!pud) 3785 if (!pud)
3781 return VM_FAULT_OOM; 3786 return VM_FAULT_OOM;
3782 pmd = pmd_alloc(mm, pud, address); 3787 pmd = pmd_alloc(mm, pud, address);
3783 if (!pmd) 3788 if (!pmd)
3784 return VM_FAULT_OOM; 3789 return VM_FAULT_OOM;
3785 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { 3790 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
3786 if (!vma->vm_ops) 3791 if (!vma->vm_ops)
3787 return do_huge_pmd_anonymous_page(mm, vma, address, 3792 return do_huge_pmd_anonymous_page(mm, vma, address,
3788 pmd, flags); 3793 pmd, flags);
3789 } else { 3794 } else {
3790 pmd_t orig_pmd = *pmd; 3795 pmd_t orig_pmd = *pmd;
3791 int ret; 3796 int ret;
3792 3797
3793 barrier(); 3798 barrier();
3794 if (pmd_trans_huge(orig_pmd)) { 3799 if (pmd_trans_huge(orig_pmd)) {
3795 unsigned int dirty = flags & FAULT_FLAG_WRITE; 3800 unsigned int dirty = flags & FAULT_FLAG_WRITE;
3796 3801
3797 /* 3802 /*
3798 * If the pmd is splitting, return and retry the 3803 * If the pmd is splitting, return and retry the
3799 * the fault. Alternative: wait until the split 3804 * the fault. Alternative: wait until the split
3800 * is done, and goto retry. 3805 * is done, and goto retry.
3801 */ 3806 */
3802 if (pmd_trans_splitting(orig_pmd)) 3807 if (pmd_trans_splitting(orig_pmd))
3803 return 0; 3808 return 0;
3804 3809
3805 if (pmd_numa(orig_pmd)) 3810 if (pmd_numa(orig_pmd))
3806 return do_huge_pmd_numa_page(mm, vma, address, 3811 return do_huge_pmd_numa_page(mm, vma, address,
3807 orig_pmd, pmd); 3812 orig_pmd, pmd);
3808 3813
3809 if (dirty && !pmd_write(orig_pmd)) { 3814 if (dirty && !pmd_write(orig_pmd)) {
3810 ret = do_huge_pmd_wp_page(mm, vma, address, pmd, 3815 ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
3811 orig_pmd); 3816 orig_pmd);
3812 /* 3817 /*
3813 * If COW results in an oom, the huge pmd will 3818 * If COW results in an oom, the huge pmd will
3814 * have been split, so retry the fault on the 3819 * have been split, so retry the fault on the
3815 * pte for a smaller charge. 3820 * pte for a smaller charge.
3816 */ 3821 */
3817 if (unlikely(ret & VM_FAULT_OOM)) 3822 if (unlikely(ret & VM_FAULT_OOM))
3818 goto retry; 3823 goto retry;
3819 return ret; 3824 return ret;
3820 } else { 3825 } else {
3821 huge_pmd_set_accessed(mm, vma, address, pmd, 3826 huge_pmd_set_accessed(mm, vma, address, pmd,
3822 orig_pmd, dirty); 3827 orig_pmd, dirty);
3823 } 3828 }
3824 3829
3825 return 0; 3830 return 0;
3826 } 3831 }
3827 } 3832 }
3828 3833
3829 if (pmd_numa(*pmd)) 3834 if (pmd_numa(*pmd))
3830 return do_pmd_numa_page(mm, vma, address, pmd); 3835 return do_pmd_numa_page(mm, vma, address, pmd);
3831 3836
3832 /* 3837 /*
3833 * Use __pte_alloc instead of pte_alloc_map, because we can't 3838 * Use __pte_alloc instead of pte_alloc_map, because we can't
3834 * run pte_offset_map on the pmd, if an huge pmd could 3839 * run pte_offset_map on the pmd, if an huge pmd could
3835 * materialize from under us from a different thread. 3840 * materialize from under us from a different thread.
3836 */ 3841 */
3837 if (unlikely(pmd_none(*pmd)) && 3842 if (unlikely(pmd_none(*pmd)) &&
3838 unlikely(__pte_alloc(mm, vma, pmd, address))) 3843 unlikely(__pte_alloc(mm, vma, pmd, address)))
3839 return VM_FAULT_OOM; 3844 return VM_FAULT_OOM;
3840 /* if an huge pmd materialized from under us just retry later */ 3845 /* if an huge pmd materialized from under us just retry later */
3841 if (unlikely(pmd_trans_huge(*pmd))) 3846 if (unlikely(pmd_trans_huge(*pmd)))
3842 return 0; 3847 return 0;
3843 /* 3848 /*
3844 * A regular pmd is established and it can't morph into a huge pmd 3849 * A regular pmd is established and it can't morph into a huge pmd
3845 * from under us anymore at this point because we hold the mmap_sem 3850 * from under us anymore at this point because we hold the mmap_sem
3846 * read mode and khugepaged takes it in write mode. So now it's 3851 * read mode and khugepaged takes it in write mode. So now it's
3847 * safe to run pte_offset_map(). 3852 * safe to run pte_offset_map().
3848 */ 3853 */
3849 pte = pte_offset_map(pmd, address); 3854 pte = pte_offset_map(pmd, address);
3850 3855
3851 return handle_pte_fault(mm, vma, address, pte, pmd, flags); 3856 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
3852 } 3857 }
3853 3858
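When the faulting pmd is empty and transparent_hugepage_enabled(vma) is true, handle_mm_fault above hands an anonymous fault to do_huge_pmd_anonymous_page instead of dropping to the pte level. A hedged userspace sketch of how that path gets exercised follows; HPAGE_SIZE = 2MB is an assumption about the architecture, and whether a huge page is really installed still depends on the transparent_hugepage sysfs settings and free memory:

	/* Illustrative only: MADV_HUGEPAGE plus a first touch of a 2MB-aligned
	 * anonymous region lets the huge-pmd fault path above be taken. */
	#define _GNU_SOURCE
	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>
	#include <sys/mman.h>

	#define HPAGE_SIZE	(2UL * 1024 * 1024)	/* assumed huge page size */

	int main(void)
	{
		size_t len = 2 * HPAGE_SIZE;
		char *raw, *aligned;

		raw = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (raw == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		aligned = (char *)(((uintptr_t)raw + HPAGE_SIZE - 1) &
				   ~(HPAGE_SIZE - 1));

		if (madvise(aligned, len, MADV_HUGEPAGE))	/* a hint, not a guarantee */
			perror("madvise");
		memset(aligned, 0, len);			/* fault the region in */

		printf("touched %zu bytes at %p\n", len, (void *)aligned);
		return 0;
	}

After running it, the AnonHugePages field for that mapping in /proc/self/smaps shows whether the fault was in fact served with a huge page.
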
3854 #ifndef __PAGETABLE_PUD_FOLDED 3859 #ifndef __PAGETABLE_PUD_FOLDED
3855 /* 3860 /*
3856 * Allocate page upper directory. 3861 * Allocate page upper directory.
3857 * We've already handled the fast-path in-line. 3862 * We've already handled the fast-path in-line.
3858 */ 3863 */
3859 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) 3864 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
3860 { 3865 {
3861 pud_t *new = pud_alloc_one(mm, address); 3866 pud_t *new = pud_alloc_one(mm, address);
3862 if (!new) 3867 if (!new)
3863 return -ENOMEM; 3868 return -ENOMEM;
3864 3869
3865 smp_wmb(); /* See comment in __pte_alloc */ 3870 smp_wmb(); /* See comment in __pte_alloc */
3866 3871
3867 spin_lock(&mm->page_table_lock); 3872 spin_lock(&mm->page_table_lock);
3868 if (pgd_present(*pgd)) /* Another has populated it */ 3873 if (pgd_present(*pgd)) /* Another has populated it */
3869 pud_free(mm, new); 3874 pud_free(mm, new);
3870 else 3875 else
3871 pgd_populate(mm, pgd, new); 3876 pgd_populate(mm, pgd, new);
3872 spin_unlock(&mm->page_table_lock); 3877 spin_unlock(&mm->page_table_lock);
3873 return 0; 3878 return 0;
3874 } 3879 }
3875 #endif /* __PAGETABLE_PUD_FOLDED */ 3880 #endif /* __PAGETABLE_PUD_FOLDED */
3876 3881
3877 #ifndef __PAGETABLE_PMD_FOLDED 3882 #ifndef __PAGETABLE_PMD_FOLDED
3878 /* 3883 /*
3879 * Allocate page middle directory. 3884 * Allocate page middle directory.
3880 * We've already handled the fast-path in-line. 3885 * We've already handled the fast-path in-line.
3881 */ 3886 */
3882 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) 3887 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3883 { 3888 {
3884 pmd_t *new = pmd_alloc_one(mm, address); 3889 pmd_t *new = pmd_alloc_one(mm, address);
3885 if (!new) 3890 if (!new)
3886 return -ENOMEM; 3891 return -ENOMEM;
3887 3892
3888 smp_wmb(); /* See comment in __pte_alloc */ 3893 smp_wmb(); /* See comment in __pte_alloc */
3889 3894
3890 spin_lock(&mm->page_table_lock); 3895 spin_lock(&mm->page_table_lock);
3891 #ifndef __ARCH_HAS_4LEVEL_HACK 3896 #ifndef __ARCH_HAS_4LEVEL_HACK
3892 if (pud_present(*pud)) /* Another has populated it */ 3897 if (pud_present(*pud)) /* Another has populated it */
3893 pmd_free(mm, new); 3898 pmd_free(mm, new);
3894 else 3899 else
3895 pud_populate(mm, pud, new); 3900 pud_populate(mm, pud, new);
3896 #else 3901 #else
3897 if (pgd_present(*pud)) /* Another has populated it */ 3902 if (pgd_present(*pud)) /* Another has populated it */
3898 pmd_free(mm, new); 3903 pmd_free(mm, new);
3899 else 3904 else
3900 pgd_populate(mm, pud, new); 3905 pgd_populate(mm, pud, new);
3901 #endif /* __ARCH_HAS_4LEVEL_HACK */ 3906 #endif /* __ARCH_HAS_4LEVEL_HACK */
3902 spin_unlock(&mm->page_table_lock); 3907 spin_unlock(&mm->page_table_lock);
3903 return 0; 3908 return 0;
3904 } 3909 }
3905 #endif /* __PAGETABLE_PMD_FOLDED */ 3910 #endif /* __PAGETABLE_PMD_FOLDED */
3906 3911
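__pud_alloc and __pmd_alloc above fully construct the new page-table page, issue smp_wmb() (the "See comment in __pte_alloc" barrier), and only then link it in under page_table_lock, so a lock-free walker that observes the new entry also observes initialized contents. That is the same publish-ordering concern the commit message raises for page contents versus mapping visibility. Below is a userspace analogue of the pattern using C11 release/acquire atomics in place of the kernel barriers; the types and names are illustrative assumptions, not kernel code:

	/* Userspace analogue of the "initialize fully, barrier, then publish"
	 * pattern used by __pte_alloc/__pud_alloc/__pmd_alloc above. */
	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	struct table {
		int entries[512];		/* stands in for a page-table page */
	};

	static struct table the_table;
	static _Atomic(struct table *) published;	/* stands in for the pud/pmd slot */

	static void *producer(void *arg)
	{
		struct table *t = &the_table;

		(void)arg;
		t->entries[0] = 42;		/* initialize first ... */
		/* ... then publish: release ordering plays the role of smp_wmb() */
		atomic_store_explicit(&published, t, memory_order_release);
		return NULL;
	}

	static void *consumer(void *arg)
	{
		struct table *t;

		(void)arg;
		/* acquire pairs with the release above: once the pointer is
		 * visible, so are the initialized contents */
		while (!(t = atomic_load_explicit(&published, memory_order_acquire)))
			;
		printf("entry 0 = %d\n", t->entries[0]);	/* always 42 */
		return NULL;
	}

	int main(void)
	{
		pthread_t a, b;

		pthread_create(&a, NULL, consumer, NULL);
		pthread_create(&b, NULL, producer, NULL);
		pthread_join(a, NULL);
		pthread_join(b, NULL);
		return 0;
	}

The release store plays the role of smp_wmb() on the producer side; the acquire load stands in for the dependency ordering that a hardware page-table walk or a dependent load provides on the consumer side.
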
3907 #if !defined(__HAVE_ARCH_GATE_AREA) 3912 #if !defined(__HAVE_ARCH_GATE_AREA)
3908 3913
3909 #if defined(AT_SYSINFO_EHDR) 3914 #if defined(AT_SYSINFO_EHDR)
3910 static struct vm_area_struct gate_vma; 3915 static struct vm_area_struct gate_vma;
3911 3916
3912 static int __init gate_vma_init(void) 3917 static int __init gate_vma_init(void)
3913 { 3918 {
3914 gate_vma.vm_mm = NULL; 3919 gate_vma.vm_mm = NULL;
3915 gate_vma.vm_start = FIXADDR_USER_START; 3920 gate_vma.vm_start = FIXADDR_USER_START;
3916 gate_vma.vm_end = FIXADDR_USER_END; 3921 gate_vma.vm_end = FIXADDR_USER_END;
3917 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; 3922 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
3918 gate_vma.vm_page_prot = __P101; 3923 gate_vma.vm_page_prot = __P101;
3919 3924
3920 return 0; 3925 return 0;
3921 } 3926 }
3922 __initcall(gate_vma_init); 3927 __initcall(gate_vma_init);
3923 #endif 3928 #endif
3924 3929
3925 struct vm_area_struct *get_gate_vma(struct mm_struct *mm) 3930 struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
3926 { 3931 {
3927 #ifdef AT_SYSINFO_EHDR 3932 #ifdef AT_SYSINFO_EHDR
3928 return &gate_vma; 3933 return &gate_vma;
3929 #else 3934 #else
3930 return NULL; 3935 return NULL;
3931 #endif 3936 #endif
3932 } 3937 }
3933 3938
3934 int in_gate_area_no_mm(unsigned long addr) 3939 int in_gate_area_no_mm(unsigned long addr)
3935 { 3940 {
3936 #ifdef AT_SYSINFO_EHDR 3941 #ifdef AT_SYSINFO_EHDR
3937 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) 3942 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
3938 return 1; 3943 return 1;
3939 #endif 3944 #endif
3940 return 0; 3945 return 0;
3941 } 3946 }
3942 3947
3943 #endif /* __HAVE_ARCH_GATE_AREA */ 3948 #endif /* __HAVE_ARCH_GATE_AREA */
3944 3949
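The gate-area helpers above are compiled in only when the architecture defines AT_SYSINFO_EHDR, the auxiliary-vector tag through which the kernel also tells userspace where the vDSO ELF image was mapped. A small illustrative check from userspace (getauxval needs glibc 2.16 or later):

	/* Illustrative only: AT_SYSINFO_EHDR is also how userspace learns
	 * the vDSO base address. */
	#include <stdio.h>
	#include <sys/auxv.h>

	int main(void)
	{
		unsigned long vdso = getauxval(AT_SYSINFO_EHDR);

		printf("AT_SYSINFO_EHDR (vDSO ELF header): 0x%lx\n", vdso);
		return 0;
	}
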
3945 static int __follow_pte(struct mm_struct *mm, unsigned long address, 3950 static int __follow_pte(struct mm_struct *mm, unsigned long address,
3946 pte_t **ptepp, spinlock_t **ptlp) 3951 pte_t **ptepp, spinlock_t **ptlp)
3947 { 3952 {
3948 pgd_t *pgd; 3953 pgd_t *pgd;
3949 pud_t *pud; 3954 pud_t *pud;
3950 pmd_t *pmd; 3955 pmd_t *pmd;
3951 pte_t *ptep; 3956 pte_t *ptep;
3952 3957
3953 pgd = pgd_offset(mm, address); 3958 pgd = pgd_offset(mm, address);
3954 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) 3959 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
3955 goto out; 3960 goto out;
3956 3961
3957 pud = pud_offset(pgd, address); 3962 pud = pud_offset(pgd, address);
3958 if (pud_none(*pud) || unlikely(pud_bad(*pud))) 3963 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
3959 goto out; 3964 goto out;
3960 3965
3961 pmd = pmd_offset(pud, address); 3966 pmd = pmd_offset(pud, address);
3962 VM_BUG_ON(pmd_trans_huge(*pmd)); 3967 VM_BUG_ON(pmd_trans_huge(*pmd));
3963 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) 3968 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
3964 goto out; 3969 goto out;
3965 3970
3966 /* We cannot handle huge page PFN maps. Luckily they don't exist. */ 3971 /* We cannot handle huge page PFN maps. Luckily they don't exist. */
3967 if (pmd_huge(*pmd)) 3972 if (pmd_huge(*pmd))
3968 goto out; 3973 goto out;
3969 3974
3970 ptep = pte_offset_map_lock(mm, pmd, address, ptlp); 3975 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
3971 if (!ptep) 3976 if (!ptep)
3972 goto out; 3977 goto out;
3973 if (!pte_present(*ptep)) 3978 if (!pte_present(*ptep))
3974 goto unlock; 3979 goto unlock;
3975 *ptepp = ptep; 3980 *ptepp = ptep;
3976 return 0; 3981 return 0;
3977 unlock: 3982 unlock:
3978 pte_unmap_unlock(ptep, *ptlp); 3983 pte_unmap_unlock(ptep, *ptlp);
3979 out: 3984 out:
3980 return -EINVAL; 3985 return -EINVAL;
3981 } 3986 }
3982 3987
3983 static inline int follow_pte(struct mm_struct *mm, unsigned long address, 3988 static inline int follow_pte(struct mm_struct *mm, unsigned long address,
3984 pte_t **ptepp, spinlock_t **ptlp) 3989 pte_t **ptepp, spinlock_t **ptlp)
3985 { 3990 {
3986 int res; 3991 int res;
3987 3992
3988 /* (void) is needed to make gcc happy */ 3993 /* (void) is needed to make gcc happy */
3989 (void) __cond_lock(*ptlp, 3994 (void) __cond_lock(*ptlp,
3990 !(res = __follow_pte(mm, address, ptepp, ptlp))); 3995 !(res = __follow_pte(mm, address, ptepp, ptlp)));
3991 return res; 3996 return res;
3992 } 3997 }
3993 3998
3994 /** 3999 /**
3995 * follow_pfn - look up PFN at a user virtual address 4000 * follow_pfn - look up PFN at a user virtual address
3996 * @vma: memory mapping 4001 * @vma: memory mapping
3997 * @address: user virtual address 4002 * @address: user virtual address
3998 * @pfn: location to store found PFN 4003 * @pfn: location to store found PFN
3999 * 4004 *
4000 * Only IO mappings and raw PFN mappings are allowed. 4005 * Only IO mappings and raw PFN mappings are allowed.
4001 * 4006 *
4002 * Returns zero and the pfn at @pfn on success, -ve otherwise. 4007 * Returns zero and the pfn at @pfn on success, -ve otherwise.
4003 */ 4008 */
4004 int follow_pfn(struct vm_area_struct *vma, unsigned long address, 4009 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
4005 unsigned long *pfn) 4010 unsigned long *pfn)
4006 { 4011 {
4007 int ret = -EINVAL; 4012 int ret = -EINVAL;
4008 spinlock_t *ptl; 4013 spinlock_t *ptl;
4009 pte_t *ptep; 4014 pte_t *ptep;
4010 4015
4011 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) 4016 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4012 return ret; 4017 return ret;
4013 4018
4014 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); 4019 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
4015 if (ret) 4020 if (ret)
4016 return ret; 4021 return ret;
4017 *pfn = pte_pfn(*ptep); 4022 *pfn = pte_pfn(*ptep);
4018 pte_unmap_unlock(ptep, ptl); 4023 pte_unmap_unlock(ptep, ptl);
4019 return 0; 4024 return 0;
4020 } 4025 }
4021 EXPORT_SYMBOL(follow_pfn); 4026 EXPORT_SYMBOL(follow_pfn);
4022 4027
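follow_pfn above resolves a user virtual address to a raw page frame number, but only for VM_IO/VM_PFNMAP mappings and only for in-kernel callers. For ordinary mappings, userspace can see the corresponding information through /proc/self/pagemap; the sketch below is illustrative only (on recent kernels the PFN bits read back as zero without CAP_SYS_ADMIN) and looks up the entry for one freshly touched page:

	/* Illustrative only: read the pagemap entry for one page.
	 * Bit 63 = present, bits 0-54 = PFN (privileged). */
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/types.h>
	#include <unistd.h>

	int main(void)
	{
		long psize = sysconf(_SC_PAGESIZE);
		uint64_t entry;
		char *page;
		off_t off;
		int fd;

		page = aligned_alloc(psize, psize);
		if (!page)
			return 1;
		memset(page, 1, psize);			/* make sure it is faulted in */

		fd = open("/proc/self/pagemap", O_RDONLY);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		off = (off_t)((uintptr_t)page / psize) * sizeof(entry);
		if (pread(fd, &entry, sizeof(entry), off) != (ssize_t)sizeof(entry)) {
			perror("pread");
			return 1;
		}
		printf("present=%llu pfn=0x%llx\n",
		       (unsigned long long)(entry >> 63),
		       (unsigned long long)(entry & ((1ULL << 55) - 1)));
		close(fd);
		free(page);
		return 0;
	}
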
4023 #ifdef CONFIG_HAVE_IOREMAP_PROT 4028 #ifdef CONFIG_HAVE_IOREMAP_PROT
4024 int follow_phys(struct vm_area_struct *vma, 4029 int follow_phys(struct vm_area_struct *vma,
4025 unsigned long address, unsigned int flags, 4030 unsigned long address, unsigned int flags,
4026 unsigned long *prot, resource_size_t *phys) 4031 unsigned long *prot, resource_size_t *phys)
4027 { 4032 {
4028 int ret = -EINVAL; 4033 int ret = -EINVAL;
4029 pte_t *ptep, pte; 4034 pte_t *ptep, pte;
4030 spinlock_t *ptl; 4035 spinlock_t *ptl;
4031 4036
4032 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) 4037 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4033 goto out; 4038 goto out;
4034 4039
4035 if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) 4040 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
4036 goto out; 4041 goto out;
4037 pte = *ptep; 4042 pte = *ptep;
4038 4043
4039 if ((flags & FOLL_WRITE) && !pte_write(pte)) 4044 if ((flags & FOLL_WRITE) && !pte_write(pte))
4040 goto unlock; 4045 goto unlock;
4041 4046
4042 *prot = pgprot_val(pte_pgprot(pte)); 4047 *prot = pgprot_val(pte_pgprot(pte));
4043 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; 4048 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
4044 4049
4045 ret = 0; 4050 ret = 0;
4046 unlock: 4051 unlock:
4047 pte_unmap_unlock(ptep, ptl); 4052 pte_unmap_unlock(ptep, ptl);
4048 out: 4053 out:
4049 return ret; 4054 return ret;
4050 } 4055 }
4051 4056
4052 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, 4057 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
4053 void *buf, int len, int write) 4058 void *buf, int len, int write)
4054 { 4059 {
4055 resource_size_t phys_addr; 4060 resource_size_t phys_addr;
4056 unsigned long prot = 0; 4061 unsigned long prot = 0;
4057 void __iomem *maddr; 4062 void __iomem *maddr;
4058 int offset = addr & (PAGE_SIZE-1); 4063 int offset = addr & (PAGE_SIZE-1);
4059 4064
4060 if (follow_phys(vma, addr, write, &prot, &phys_addr)) 4065 if (follow_phys(vma, addr, write, &prot, &phys_addr))
4061 return -EINVAL; 4066 return -EINVAL;
4062 4067
4063 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); 4068 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
4064 if (write) 4069 if (write)
4065 memcpy_toio(maddr + offset, buf, len); 4070 memcpy_toio(maddr + offset, buf, len);
4066 else 4071 else
4067 memcpy_fromio(buf, maddr + offset, len); 4072 memcpy_fromio(buf, maddr + offset, len);
4068 iounmap(maddr); 4073 iounmap(maddr);
4069 4074
4070 return len; 4075 return len;
4071 } 4076 }
4072 #endif 4077 #endif
4073 4078
4074 /* 4079 /*
4075 * Access another process' address space as given in mm. If non-NULL, use the 4080 * Access another process' address space as given in mm. If non-NULL, use the
4076 * given task for page fault accounting. 4081 * given task for page fault accounting.
4077 */ 4082 */
4078 static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, 4083 static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
4079 unsigned long addr, void *buf, int len, int write) 4084 unsigned long addr, void *buf, int len, int write)
4080 { 4085 {
4081 struct vm_area_struct *vma; 4086 struct vm_area_struct *vma;
4082 void *old_buf = buf; 4087 void *old_buf = buf;
4083 4088
4084 down_read(&mm->mmap_sem); 4089 down_read(&mm->mmap_sem);
4085 /* ignore errors, just check how much was successfully transferred */ 4090 /* ignore errors, just check how much was successfully transferred */
4086 while (len) { 4091 while (len) {
4087 int bytes, ret, offset; 4092 int bytes, ret, offset;
4088 void *maddr; 4093 void *maddr;
4089 struct page *page = NULL; 4094 struct page *page = NULL;
4090 4095
4091 ret = get_user_pages(tsk, mm, addr, 1, 4096 ret = get_user_pages(tsk, mm, addr, 1,
4092 write, 1, &page, &vma); 4097 write, 1, &page, &vma);
4093 if (ret <= 0) { 4098 if (ret <= 0) {
4094 /* 4099 /*
4095 * Check if this is a VM_IO | VM_PFNMAP VMA, which 4100 * Check if this is a VM_IO | VM_PFNMAP VMA, which
4096 * we can access using slightly different code. 4101 * we can access using slightly different code.
4097 */ 4102 */
4098 #ifdef CONFIG_HAVE_IOREMAP_PROT 4103 #ifdef CONFIG_HAVE_IOREMAP_PROT
4099 vma = find_vma(mm, addr); 4104 vma = find_vma(mm, addr);
4100 if (!vma || vma->vm_start > addr) 4105 if (!vma || vma->vm_start > addr)
4101 break; 4106 break;
4102 if (vma->vm_ops && vma->vm_ops->access) 4107 if (vma->vm_ops && vma->vm_ops->access)
4103 ret = vma->vm_ops->access(vma, addr, buf, 4108 ret = vma->vm_ops->access(vma, addr, buf,
4104 len, write); 4109 len, write);
4105 if (ret <= 0) 4110 if (ret <= 0)
4106 #endif 4111 #endif
4107 break; 4112 break;
4108 bytes = ret; 4113 bytes = ret;
4109 } else { 4114 } else {
4110 bytes = len; 4115 bytes = len;
4111 offset = addr & (PAGE_SIZE-1); 4116 offset = addr & (PAGE_SIZE-1);
4112 if (bytes > PAGE_SIZE-offset) 4117 if (bytes > PAGE_SIZE-offset)
4113 bytes = PAGE_SIZE-offset; 4118 bytes = PAGE_SIZE-offset;
4114 4119
4115 maddr = kmap(page); 4120 maddr = kmap(page);
4116 if (write) { 4121 if (write) {
4117 copy_to_user_page(vma, page, addr, 4122 copy_to_user_page(vma, page, addr,
4118 maddr + offset, buf, bytes); 4123 maddr + offset, buf, bytes);
4119 set_page_dirty_lock(page); 4124 set_page_dirty_lock(page);
4120 } else { 4125 } else {
4121 copy_from_user_page(vma, page, addr, 4126 copy_from_user_page(vma, page, addr,
4122 buf, maddr + offset, bytes); 4127 buf, maddr + offset, bytes);
4123 } 4128 }
4124 kunmap(page); 4129 kunmap(page);
4125 page_cache_release(page); 4130 page_cache_release(page);
4126 } 4131 }
4127 len -= bytes; 4132 len -= bytes;
4128 buf += bytes; 4133 buf += bytes;
4129 addr += bytes; 4134 addr += bytes;
4130 } 4135 }
4131 up_read(&mm->mmap_sem); 4136 up_read(&mm->mmap_sem);
4132 4137
4133 return buf - old_buf; 4138 return buf - old_buf;
4134 } 4139 }
4135 4140
4136 /** 4141 /**
4137 * access_remote_vm - access another process' address space 4142 * access_remote_vm - access another process' address space
4138 * @mm: the mm_struct of the target address space 4143 * @mm: the mm_struct of the target address space
4139 * @addr: start address to access 4144 * @addr: start address to access
4140 * @buf: source or destination buffer 4145 * @buf: source or destination buffer
4141 * @len: number of bytes to transfer 4146 * @len: number of bytes to transfer
4142 * @write: whether the access is a write 4147 * @write: whether the access is a write
4143 * 4148 *
4144 * The caller must hold a reference on @mm. 4149 * The caller must hold a reference on @mm.
4145 */ 4150 */
4146 int access_remote_vm(struct mm_struct *mm, unsigned long addr, 4151 int access_remote_vm(struct mm_struct *mm, unsigned long addr,
4147 void *buf, int len, int write) 4152 void *buf, int len, int write)
4148 { 4153 {
4149 return __access_remote_vm(NULL, mm, addr, buf, len, write); 4154 return __access_remote_vm(NULL, mm, addr, buf, len, write);
4150 } 4155 }
4151 4156
4152 /* 4157 /*
4153 * Access another process' address space. 4158 * Access another process' address space.
4154 * Source/target buffer must be kernel space, 4159 * Source/target buffer must be kernel space,
4155 * Do not walk the page table directly, use get_user_pages 4160 * Do not walk the page table directly, use get_user_pages
4156 */ 4161 */
4157 int access_process_vm(struct task_struct *tsk, unsigned long addr, 4162 int access_process_vm(struct task_struct *tsk, unsigned long addr,
4158 void *buf, int len, int write) 4163 void *buf, int len, int write)
4159 { 4164 {
4160 struct mm_struct *mm; 4165 struct mm_struct *mm;
4161 int ret; 4166 int ret;
4162 4167
4163 mm = get_task_mm(tsk); 4168 mm = get_task_mm(tsk);
4164 if (!mm) 4169 if (!mm)
4165 return 0; 4170 return 0;
4166 4171
4167 ret = __access_remote_vm(tsk, mm, addr, buf, len, write); 4172 ret = __access_remote_vm(tsk, mm, addr, buf, len, write);
4168 mmput(mm); 4173 mmput(mm);
4169 4174
4170 return ret; 4175 return ret;
4171 } 4176 }
4172 4177
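access_process_vm and __access_remote_vm above are the machinery behind ptrace(PTRACE_PEEKDATA/PTRACE_POKEDATA) and /proc/<pid>/mem: get_user_pages pins the target page, and kmap plus copy_to_user_page/copy_from_user_page move the bytes. As a minimal self-referential illustration (a real debugger would attach to another task and be subject to ptrace access-mode checks), the program below reads one of its own buffers back through /proc/self/mem, which lands in this same path:

	/* Illustrative only: read one of our own buffers back through
	 * /proc/self/mem, which is serviced by access_remote_vm(). */
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <sys/types.h>
	#include <unistd.h>

	int main(void)
	{
		char secret[] = "hello via /proc/self/mem";
		char copy[sizeof(secret)];
		int fd;

		fd = open("/proc/self/mem", O_RDONLY);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (pread(fd, copy, sizeof(copy), (off_t)(uintptr_t)secret) !=
		    (ssize_t)sizeof(copy)) {
			perror("pread");
			return 1;
		}
		printf("read back: %s\n", copy);
		close(fd);
		return 0;
	}
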
4173 /* 4178 /*
4174 * Print the name of a VMA. 4179 * Print the name of a VMA.
4175 */ 4180 */
4176 void print_vma_addr(char *prefix, unsigned long ip) 4181 void print_vma_addr(char *prefix, unsigned long ip)
4177 { 4182 {
4178 struct mm_struct *mm = current->mm; 4183 struct mm_struct *mm = current->mm;
4179 struct vm_area_struct *vma; 4184 struct vm_area_struct *vma;
4180 4185
4181 /* 4186 /*
4182 * Do not print if we are in atomic 4187 * Do not print if we are in atomic
4183 * contexts (in exception stacks, etc.): 4188 * contexts (in exception stacks, etc.):
4184 */ 4189 */
4185 if (preempt_count()) 4190 if (preempt_count())
4186 return; 4191 return;
4187 4192
4188 down_read(&mm->mmap_sem); 4193 down_read(&mm->mmap_sem);
4189 vma = find_vma(mm, ip); 4194 vma = find_vma(mm, ip);
4190 if (vma && vma->vm_file) { 4195 if (vma && vma->vm_file) {
4191 struct file *f = vma->vm_file; 4196 struct file *f = vma->vm_file;
4192 char *buf = (char *)__get_free_page(GFP_KERNEL); 4197 char *buf = (char *)__get_free_page(GFP_KERNEL);
4193 if (buf) { 4198 if (buf) {
4194 char *p; 4199 char *p;
4195 4200
4196 p = d_path(&f->f_path, buf, PAGE_SIZE); 4201 p = d_path(&f->f_path, buf, PAGE_SIZE);
4197 if (IS_ERR(p)) 4202 if (IS_ERR(p))
4198 p = "?"; 4203 p = "?";
4199 printk("%s%s[%lx+%lx]", prefix, kbasename(p), 4204 printk("%s%s[%lx+%lx]", prefix, kbasename(p),
4200 vma->vm_start, 4205 vma->vm_start,
4201 vma->vm_end - vma->vm_start); 4206 vma->vm_end - vma->vm_start);
4202 free_page((unsigned long)buf); 4207 free_page((unsigned long)buf);
4203 } 4208 }
4204 } 4209 }
4205 up_read(&mm->mmap_sem); 4210 up_read(&mm->mmap_sem);
4206 } 4211 }
4207 4212
4208 #ifdef CONFIG_PROVE_LOCKING 4213 #ifdef CONFIG_PROVE_LOCKING
4209 void might_fault(void) 4214 void might_fault(void)
4210 { 4215 {
4211 /* 4216 /*
4212 * Some code (nfs/sunrpc) uses socket ops on kernel memory while 4217 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
4213 * holding the mmap_sem, this is safe because kernel memory doesn't 4218 * holding the mmap_sem, this is safe because kernel memory doesn't
4214 * get paged out, therefore we'll never actually fault, and the 4219 * get paged out, therefore we'll never actually fault, and the
4215 * below annotations will generate false positives. 4220 * below annotations will generate false positives.
4216 */ 4221 */
4217 if (segment_eq(get_fs(), KERNEL_DS)) 4222 if (segment_eq(get_fs(), KERNEL_DS))
4218 return; 4223 return;
4219 4224
4220 might_sleep(); 4225 might_sleep();
4221 /* 4226 /*
4222 * it would be nicer only to annotate paths which are not under 4227 * it would be nicer only to annotate paths which are not under
4223 * pagefault_disable, however that requires a larger audit and 4228 * pagefault_disable, however that requires a larger audit and
4224 * providing helpers like get_user_atomic. 4229 * providing helpers like get_user_atomic.
4225 */ 4230 */
4226 if (!in_atomic() && current->mm) 4231 if (!in_atomic() && current->mm)
4227 might_lock_read(&current->mm->mmap_sem); 4232 might_lock_read(&current->mm->mmap_sem);
4228 } 4233 }
4229 EXPORT_SYMBOL(might_fault); 4234 EXPORT_SYMBOL(might_fault);
4230 #endif 4235 #endif
4231 4236
4232 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) 4237 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
4233 static void clear_gigantic_page(struct page *page, 4238 static void clear_gigantic_page(struct page *page,
4234 unsigned long addr, 4239 unsigned long addr,
4235 unsigned int pages_per_huge_page) 4240 unsigned int pages_per_huge_page)
4236 { 4241 {
4237 int i; 4242 int i;
4238 struct page *p = page; 4243 struct page *p = page;
4239 4244
4240 might_sleep(); 4245 might_sleep();
4241 for (i = 0; i < pages_per_huge_page; 4246 for (i = 0; i < pages_per_huge_page;
4242 i++, p = mem_map_next(p, page, i)) { 4247 i++, p = mem_map_next(p, page, i)) {
4243 cond_resched(); 4248 cond_resched();
4244 clear_user_highpage(p, addr + i * PAGE_SIZE); 4249 clear_user_highpage(p, addr + i * PAGE_SIZE);
4245 } 4250 }
4246 } 4251 }
4247 void clear_huge_page(struct page *page, 4252 void clear_huge_page(struct page *page,
4248 unsigned long addr, unsigned int pages_per_huge_page) 4253 unsigned long addr, unsigned int pages_per_huge_page)
4249 { 4254 {
4250 int i; 4255 int i;
4251 4256
4252 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { 4257 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4253 clear_gigantic_page(page, addr, pages_per_huge_page); 4258 clear_gigantic_page(page, addr, pages_per_huge_page);
4254 return; 4259 return;
4255 } 4260 }
4256 4261
4257 might_sleep(); 4262 might_sleep();
4258 for (i = 0; i < pages_per_huge_page; i++) { 4263 for (i = 0; i < pages_per_huge_page; i++) {
4259 cond_resched(); 4264 cond_resched();
4260 clear_user_highpage(page + i, addr + i * PAGE_SIZE); 4265 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
4261 } 4266 }
4262 } 4267 }
4263 4268
4264 static void copy_user_gigantic_page(struct page *dst, struct page *src, 4269 static void copy_user_gigantic_page(struct page *dst, struct page *src,
4265 unsigned long addr, 4270 unsigned long addr,
4266 struct vm_area_struct *vma, 4271 struct vm_area_struct *vma,
4267 unsigned int pages_per_huge_page) 4272 unsigned int pages_per_huge_page)
4268 { 4273 {
4269 int i; 4274 int i;
4270 struct page *dst_base = dst; 4275 struct page *dst_base = dst;
4271 struct page *src_base = src; 4276 struct page *src_base = src;
4272 4277
4273 for (i = 0; i < pages_per_huge_page; ) { 4278 for (i = 0; i < pages_per_huge_page; ) {
4274 cond_resched(); 4279 cond_resched();
4275 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); 4280 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
4276 4281
4277 i++; 4282 i++;
4278 dst = mem_map_next(dst, dst_base, i); 4283 dst = mem_map_next(dst, dst_base, i);
4279 src = mem_map_next(src, src_base, i); 4284 src = mem_map_next(src, src_base, i);
4280 } 4285 }
4281 } 4286 }
4282 4287
4283 void copy_user_huge_page(struct page *dst, struct page *src, 4288 void copy_user_huge_page(struct page *dst, struct page *src,
4284 unsigned long addr, struct vm_area_struct *vma, 4289 unsigned long addr, struct vm_area_struct *vma,
4285 unsigned int pages_per_huge_page) 4290 unsigned int pages_per_huge_page)
4286 { 4291 {
4287 int i; 4292 int i;
4288 4293
4289 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { 4294 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4290 copy_user_gigantic_page(dst, src, addr, vma, 4295 copy_user_gigantic_page(dst, src, addr, vma,
4291 pages_per_huge_page); 4296 pages_per_huge_page);
4292 return; 4297 return;
4293 } 4298 }
4294 4299
4295 might_sleep(); 4300 might_sleep();
4296 for (i = 0; i < pages_per_huge_page; i++) { 4301 for (i = 0; i < pages_per_huge_page; i++) {
4297 cond_resched(); 4302 cond_resched();
4298 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); 4303 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
4299 } 4304 }
4300 } 4305 }
4301 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ 4306 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
4302 4307