Commit af73e4d9506d3b797509f3c030e7dcd554f7d9c4

Authored by Naoya Horiguchi
Committed by Linus Torvalds
1 parent 1ab4ce7623

hugetlbfs: fix mmap failure in unaligned size request

The current kernel returns -EINVAL unless a given mmap length is
"almost" hugepage aligned.  This is because in sys_mmap_pgoff() the
given length is passed to vm_mmap_pgoff() as-is, without being aligned
to a hugepage boundary.
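
For illustration only (not part of the patch), a minimal user-space
reproducer, assuming an x86_64 system with 2MB default huge pages and a
few huge pages reserved via /proc/sys/vm/nr_hugepages.  Before this fix
the unaligned length below makes mmap() fail with EINVAL; with the fix
the kernel rounds the length up to the hugepage boundary and the
mapping succeeds.

#define _GNU_SOURCE		/* MAP_HUGETLB/MAP_ANONYMOUS on older glibc */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	/* deliberately NOT a multiple of the 2MB huge page size */
	size_t len = 2 * 1024 * 1024 + 4096;
	void *p;

	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (p == MAP_FAILED) {
		printf("mmap: %s\n", strerror(errno));
		return 1;
	}
	printf("mmap succeeded at %p\n", p);
	munmap(p, len);
	return 0;
}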

This is a regression introduced in commit 40716e29243d ("hugetlbfs: fix
alignment of huge page requests"), where the alignment code was pushed
into hugetlb_file_setup() but the variable len on the caller side was
left unchanged.

To fix this, this patch partially reverts that commit and adds the
alignment code on the caller side.  It also introduces hstate_sizelog()
in order to get the proper hstate for the specified hugepage size.
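
The mm/mmap.c and include/linux/hugetlb.h hunks that implement this are
not part of the excerpt below, so the following is only a sketch,
reconstructed from this description and from the get_hstate_idx()
change visible in the diff; it is not a verbatim quote of the patch.

/* Sketch: map the MAP_HUGE_* page-size log (0 means "use the default
 * huge page size") to the corresponding hstate, mirroring what
 * get_hstate_idx() used to open-code. */
static inline struct hstate *hstate_sizelog(int page_size_log)
{
	if (!page_size_log)
		return &default_hstate;
	return size_to_hstate(1 << page_size_log);
}

/* Sketch of the caller-side alignment in sys_mmap_pgoff(): round len up
 * to the hugepage boundary before it is passed to vm_mmap_pgoff(). */
if (flags & MAP_HUGETLB) {
	/* the 6 bits above MAP_HUGE_SHIFT encode the page-size log */
	struct hstate *hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & 0x3f);

	if (!hs)
		return -EINVAL;
	len = ALIGN(len, huge_page_size(hs));
	/* ... hugetlb_file_setup(..., len, ...) as before ... */
}

The new comment added above hugetlb_file_setup() in the diff below
spells out the resulting contract: size must already be hugepage
aligned by the caller, otherwise hugetlb_reserve_pages() reserves one
hugepage less than intended.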

Addresses https://bugzilla.kernel.org/show_bug.cgi?id=56881

[akpm@linux-foundation.org: fix warning when CONFIG_HUGETLB_PAGE=n]
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: <iceman_dvd@yahoo.com>
Cc: Steven Truelove <steven.truelove@utoronto.ca>
Cc: Jianguo Wu <wujianguo@huawei.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 4 changed files with 34 additions and 22 deletions

fs/hugetlbfs/inode.c
1 /* 1 /*
2 * hugetlbpage-backed filesystem. Based on ramfs. 2 * hugetlbpage-backed filesystem. Based on ramfs.
3 * 3 *
4 * Nadia Yvette Chambers, 2002 4 * Nadia Yvette Chambers, 2002
5 * 5 *
6 * Copyright (C) 2002 Linus Torvalds. 6 * Copyright (C) 2002 Linus Torvalds.
7 */ 7 */
8 8
9 #include <linux/module.h> 9 #include <linux/module.h>
10 #include <linux/thread_info.h> 10 #include <linux/thread_info.h>
11 #include <asm/current.h> 11 #include <asm/current.h>
12 #include <linux/sched.h> /* remove ASAP */ 12 #include <linux/sched.h> /* remove ASAP */
13 #include <linux/fs.h> 13 #include <linux/fs.h>
14 #include <linux/mount.h> 14 #include <linux/mount.h>
15 #include <linux/file.h> 15 #include <linux/file.h>
16 #include <linux/kernel.h> 16 #include <linux/kernel.h>
17 #include <linux/writeback.h> 17 #include <linux/writeback.h>
18 #include <linux/pagemap.h> 18 #include <linux/pagemap.h>
19 #include <linux/highmem.h> 19 #include <linux/highmem.h>
20 #include <linux/init.h> 20 #include <linux/init.h>
21 #include <linux/string.h> 21 #include <linux/string.h>
22 #include <linux/capability.h> 22 #include <linux/capability.h>
23 #include <linux/ctype.h> 23 #include <linux/ctype.h>
24 #include <linux/backing-dev.h> 24 #include <linux/backing-dev.h>
25 #include <linux/hugetlb.h> 25 #include <linux/hugetlb.h>
26 #include <linux/pagevec.h> 26 #include <linux/pagevec.h>
27 #include <linux/parser.h> 27 #include <linux/parser.h>
28 #include <linux/mman.h> 28 #include <linux/mman.h>
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 #include <linux/dnotify.h> 30 #include <linux/dnotify.h>
31 #include <linux/statfs.h> 31 #include <linux/statfs.h>
32 #include <linux/security.h> 32 #include <linux/security.h>
33 #include <linux/magic.h> 33 #include <linux/magic.h>
34 #include <linux/migrate.h> 34 #include <linux/migrate.h>
35 35
36 #include <asm/uaccess.h> 36 #include <asm/uaccess.h>
37 37
38 static const struct super_operations hugetlbfs_ops; 38 static const struct super_operations hugetlbfs_ops;
39 static const struct address_space_operations hugetlbfs_aops; 39 static const struct address_space_operations hugetlbfs_aops;
40 const struct file_operations hugetlbfs_file_operations; 40 const struct file_operations hugetlbfs_file_operations;
41 static const struct inode_operations hugetlbfs_dir_inode_operations; 41 static const struct inode_operations hugetlbfs_dir_inode_operations;
42 static const struct inode_operations hugetlbfs_inode_operations; 42 static const struct inode_operations hugetlbfs_inode_operations;
43 43
44 struct hugetlbfs_config { 44 struct hugetlbfs_config {
45 kuid_t uid; 45 kuid_t uid;
46 kgid_t gid; 46 kgid_t gid;
47 umode_t mode; 47 umode_t mode;
48 long nr_blocks; 48 long nr_blocks;
49 long nr_inodes; 49 long nr_inodes;
50 struct hstate *hstate; 50 struct hstate *hstate;
51 }; 51 };
52 52
53 struct hugetlbfs_inode_info { 53 struct hugetlbfs_inode_info {
54 struct shared_policy policy; 54 struct shared_policy policy;
55 struct inode vfs_inode; 55 struct inode vfs_inode;
56 }; 56 };
57 57
58 static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) 58 static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
59 { 59 {
60 return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); 60 return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
61 } 61 }
62 62
63 static struct backing_dev_info hugetlbfs_backing_dev_info = { 63 static struct backing_dev_info hugetlbfs_backing_dev_info = {
64 .name = "hugetlbfs", 64 .name = "hugetlbfs",
65 .ra_pages = 0, /* No readahead */ 65 .ra_pages = 0, /* No readahead */
66 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 66 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
67 }; 67 };
68 68
69 int sysctl_hugetlb_shm_group; 69 int sysctl_hugetlb_shm_group;
70 70
71 enum { 71 enum {
72 Opt_size, Opt_nr_inodes, 72 Opt_size, Opt_nr_inodes,
73 Opt_mode, Opt_uid, Opt_gid, 73 Opt_mode, Opt_uid, Opt_gid,
74 Opt_pagesize, 74 Opt_pagesize,
75 Opt_err, 75 Opt_err,
76 }; 76 };
77 77
78 static const match_table_t tokens = { 78 static const match_table_t tokens = {
79 {Opt_size, "size=%s"}, 79 {Opt_size, "size=%s"},
80 {Opt_nr_inodes, "nr_inodes=%s"}, 80 {Opt_nr_inodes, "nr_inodes=%s"},
81 {Opt_mode, "mode=%o"}, 81 {Opt_mode, "mode=%o"},
82 {Opt_uid, "uid=%u"}, 82 {Opt_uid, "uid=%u"},
83 {Opt_gid, "gid=%u"}, 83 {Opt_gid, "gid=%u"},
84 {Opt_pagesize, "pagesize=%s"}, 84 {Opt_pagesize, "pagesize=%s"},
85 {Opt_err, NULL}, 85 {Opt_err, NULL},
86 }; 86 };
87 87
88 static void huge_pagevec_release(struct pagevec *pvec) 88 static void huge_pagevec_release(struct pagevec *pvec)
89 { 89 {
90 int i; 90 int i;
91 91
92 for (i = 0; i < pagevec_count(pvec); ++i) 92 for (i = 0; i < pagevec_count(pvec); ++i)
93 put_page(pvec->pages[i]); 93 put_page(pvec->pages[i]);
94 94
95 pagevec_reinit(pvec); 95 pagevec_reinit(pvec);
96 } 96 }
97 97
98 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) 98 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
99 { 99 {
100 struct inode *inode = file_inode(file); 100 struct inode *inode = file_inode(file);
101 loff_t len, vma_len; 101 loff_t len, vma_len;
102 int ret; 102 int ret;
103 struct hstate *h = hstate_file(file); 103 struct hstate *h = hstate_file(file);
104 104
105 /* 105 /*
106 * vma address alignment (but not the pgoff alignment) has 106 * vma address alignment (but not the pgoff alignment) has
107 * already been checked by prepare_hugepage_range. If you add 107 * already been checked by prepare_hugepage_range. If you add
108 * any error returns here, do so after setting VM_HUGETLB, so 108 * any error returns here, do so after setting VM_HUGETLB, so
109 * is_vm_hugetlb_page tests below unmap_region go the right 109 * is_vm_hugetlb_page tests below unmap_region go the right
110 * way when do_mmap_pgoff unwinds (may be important on powerpc 110 * way when do_mmap_pgoff unwinds (may be important on powerpc
111 * and ia64). 111 * and ia64).
112 */ 112 */
113 vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; 113 vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
114 vma->vm_ops = &hugetlb_vm_ops; 114 vma->vm_ops = &hugetlb_vm_ops;
115 115
116 if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) 116 if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
117 return -EINVAL; 117 return -EINVAL;
118 118
119 vma_len = (loff_t)(vma->vm_end - vma->vm_start); 119 vma_len = (loff_t)(vma->vm_end - vma->vm_start);
120 120
121 mutex_lock(&inode->i_mutex); 121 mutex_lock(&inode->i_mutex);
122 file_accessed(file); 122 file_accessed(file);
123 123
124 ret = -ENOMEM; 124 ret = -ENOMEM;
125 len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 125 len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
126 126
127 if (hugetlb_reserve_pages(inode, 127 if (hugetlb_reserve_pages(inode,
128 vma->vm_pgoff >> huge_page_order(h), 128 vma->vm_pgoff >> huge_page_order(h),
129 len >> huge_page_shift(h), vma, 129 len >> huge_page_shift(h), vma,
130 vma->vm_flags)) 130 vma->vm_flags))
131 goto out; 131 goto out;
132 132
133 ret = 0; 133 ret = 0;
134 hugetlb_prefault_arch_hook(vma->vm_mm); 134 hugetlb_prefault_arch_hook(vma->vm_mm);
135 if (vma->vm_flags & VM_WRITE && inode->i_size < len) 135 if (vma->vm_flags & VM_WRITE && inode->i_size < len)
136 inode->i_size = len; 136 inode->i_size = len;
137 out: 137 out:
138 mutex_unlock(&inode->i_mutex); 138 mutex_unlock(&inode->i_mutex);
139 139
140 return ret; 140 return ret;
141 } 141 }
142 142
143 /* 143 /*
144 * Called under down_write(mmap_sem). 144 * Called under down_write(mmap_sem).
145 */ 145 */
146 146
147 #ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA 147 #ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
148 static unsigned long 148 static unsigned long
149 hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 149 hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
150 unsigned long len, unsigned long pgoff, unsigned long flags) 150 unsigned long len, unsigned long pgoff, unsigned long flags)
151 { 151 {
152 struct mm_struct *mm = current->mm; 152 struct mm_struct *mm = current->mm;
153 struct vm_area_struct *vma; 153 struct vm_area_struct *vma;
154 struct hstate *h = hstate_file(file); 154 struct hstate *h = hstate_file(file);
155 struct vm_unmapped_area_info info; 155 struct vm_unmapped_area_info info;
156 156
157 if (len & ~huge_page_mask(h)) 157 if (len & ~huge_page_mask(h))
158 return -EINVAL; 158 return -EINVAL;
159 if (len > TASK_SIZE) 159 if (len > TASK_SIZE)
160 return -ENOMEM; 160 return -ENOMEM;
161 161
162 if (flags & MAP_FIXED) { 162 if (flags & MAP_FIXED) {
163 if (prepare_hugepage_range(file, addr, len)) 163 if (prepare_hugepage_range(file, addr, len))
164 return -EINVAL; 164 return -EINVAL;
165 return addr; 165 return addr;
166 } 166 }
167 167
168 if (addr) { 168 if (addr) {
169 addr = ALIGN(addr, huge_page_size(h)); 169 addr = ALIGN(addr, huge_page_size(h));
170 vma = find_vma(mm, addr); 170 vma = find_vma(mm, addr);
171 if (TASK_SIZE - len >= addr && 171 if (TASK_SIZE - len >= addr &&
172 (!vma || addr + len <= vma->vm_start)) 172 (!vma || addr + len <= vma->vm_start))
173 return addr; 173 return addr;
174 } 174 }
175 175
176 info.flags = 0; 176 info.flags = 0;
177 info.length = len; 177 info.length = len;
178 info.low_limit = TASK_UNMAPPED_BASE; 178 info.low_limit = TASK_UNMAPPED_BASE;
179 info.high_limit = TASK_SIZE; 179 info.high_limit = TASK_SIZE;
180 info.align_mask = PAGE_MASK & ~huge_page_mask(h); 180 info.align_mask = PAGE_MASK & ~huge_page_mask(h);
181 info.align_offset = 0; 181 info.align_offset = 0;
182 return vm_unmapped_area(&info); 182 return vm_unmapped_area(&info);
183 } 183 }
184 #endif 184 #endif
185 185
186 static int 186 static int
187 hugetlbfs_read_actor(struct page *page, unsigned long offset, 187 hugetlbfs_read_actor(struct page *page, unsigned long offset,
188 char __user *buf, unsigned long count, 188 char __user *buf, unsigned long count,
189 unsigned long size) 189 unsigned long size)
190 { 190 {
191 char *kaddr; 191 char *kaddr;
192 unsigned long left, copied = 0; 192 unsigned long left, copied = 0;
193 int i, chunksize; 193 int i, chunksize;
194 194
195 if (size > count) 195 if (size > count)
196 size = count; 196 size = count;
197 197
198 /* Find which 4k chunk and offset with in that chunk */ 198 /* Find which 4k chunk and offset with in that chunk */
199 i = offset >> PAGE_CACHE_SHIFT; 199 i = offset >> PAGE_CACHE_SHIFT;
200 offset = offset & ~PAGE_CACHE_MASK; 200 offset = offset & ~PAGE_CACHE_MASK;
201 201
202 while (size) { 202 while (size) {
203 chunksize = PAGE_CACHE_SIZE; 203 chunksize = PAGE_CACHE_SIZE;
204 if (offset) 204 if (offset)
205 chunksize -= offset; 205 chunksize -= offset;
206 if (chunksize > size) 206 if (chunksize > size)
207 chunksize = size; 207 chunksize = size;
208 kaddr = kmap(&page[i]); 208 kaddr = kmap(&page[i]);
209 left = __copy_to_user(buf, kaddr + offset, chunksize); 209 left = __copy_to_user(buf, kaddr + offset, chunksize);
210 kunmap(&page[i]); 210 kunmap(&page[i]);
211 if (left) { 211 if (left) {
212 copied += (chunksize - left); 212 copied += (chunksize - left);
213 break; 213 break;
214 } 214 }
215 offset = 0; 215 offset = 0;
216 size -= chunksize; 216 size -= chunksize;
217 buf += chunksize; 217 buf += chunksize;
218 copied += chunksize; 218 copied += chunksize;
219 i++; 219 i++;
220 } 220 }
221 return copied ? copied : -EFAULT; 221 return copied ? copied : -EFAULT;
222 } 222 }
223 223
224 /* 224 /*
225 * Support for read() - Find the page attached to f_mapping and copy out the 225 * Support for read() - Find the page attached to f_mapping and copy out the
226 * data. Its *very* similar to do_generic_mapping_read(), we can't use that 226 * data. Its *very* similar to do_generic_mapping_read(), we can't use that
227 * since it has PAGE_CACHE_SIZE assumptions. 227 * since it has PAGE_CACHE_SIZE assumptions.
228 */ 228 */
229 static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, 229 static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
230 size_t len, loff_t *ppos) 230 size_t len, loff_t *ppos)
231 { 231 {
232 struct hstate *h = hstate_file(filp); 232 struct hstate *h = hstate_file(filp);
233 struct address_space *mapping = filp->f_mapping; 233 struct address_space *mapping = filp->f_mapping;
234 struct inode *inode = mapping->host; 234 struct inode *inode = mapping->host;
235 unsigned long index = *ppos >> huge_page_shift(h); 235 unsigned long index = *ppos >> huge_page_shift(h);
236 unsigned long offset = *ppos & ~huge_page_mask(h); 236 unsigned long offset = *ppos & ~huge_page_mask(h);
237 unsigned long end_index; 237 unsigned long end_index;
238 loff_t isize; 238 loff_t isize;
239 ssize_t retval = 0; 239 ssize_t retval = 0;
240 240
241 /* validate length */ 241 /* validate length */
242 if (len == 0) 242 if (len == 0)
243 goto out; 243 goto out;
244 244
245 for (;;) { 245 for (;;) {
246 struct page *page; 246 struct page *page;
247 unsigned long nr, ret; 247 unsigned long nr, ret;
248 int ra; 248 int ra;
249 249
250 /* nr is the maximum number of bytes to copy from this page */ 250 /* nr is the maximum number of bytes to copy from this page */
251 nr = huge_page_size(h); 251 nr = huge_page_size(h);
252 isize = i_size_read(inode); 252 isize = i_size_read(inode);
253 if (!isize) 253 if (!isize)
254 goto out; 254 goto out;
255 end_index = (isize - 1) >> huge_page_shift(h); 255 end_index = (isize - 1) >> huge_page_shift(h);
256 if (index >= end_index) { 256 if (index >= end_index) {
257 if (index > end_index) 257 if (index > end_index)
258 goto out; 258 goto out;
259 nr = ((isize - 1) & ~huge_page_mask(h)) + 1; 259 nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
260 if (nr <= offset) 260 if (nr <= offset)
261 goto out; 261 goto out;
262 } 262 }
263 nr = nr - offset; 263 nr = nr - offset;
264 264
265 /* Find the page */ 265 /* Find the page */
266 page = find_lock_page(mapping, index); 266 page = find_lock_page(mapping, index);
267 if (unlikely(page == NULL)) { 267 if (unlikely(page == NULL)) {
268 /* 268 /*
269 * We have a HOLE, zero out the user-buffer for the 269 * We have a HOLE, zero out the user-buffer for the
270 * length of the hole or request. 270 * length of the hole or request.
271 */ 271 */
272 ret = len < nr ? len : nr; 272 ret = len < nr ? len : nr;
273 if (clear_user(buf, ret)) 273 if (clear_user(buf, ret))
274 ra = -EFAULT; 274 ra = -EFAULT;
275 else 275 else
276 ra = 0; 276 ra = 0;
277 } else { 277 } else {
278 unlock_page(page); 278 unlock_page(page);
279 279
280 /* 280 /*
281 * We have the page, copy it to user space buffer. 281 * We have the page, copy it to user space buffer.
282 */ 282 */
283 ra = hugetlbfs_read_actor(page, offset, buf, len, nr); 283 ra = hugetlbfs_read_actor(page, offset, buf, len, nr);
284 ret = ra; 284 ret = ra;
285 page_cache_release(page); 285 page_cache_release(page);
286 } 286 }
287 if (ra < 0) { 287 if (ra < 0) {
288 if (retval == 0) 288 if (retval == 0)
289 retval = ra; 289 retval = ra;
290 goto out; 290 goto out;
291 } 291 }
292 292
293 offset += ret; 293 offset += ret;
294 retval += ret; 294 retval += ret;
295 len -= ret; 295 len -= ret;
296 index += offset >> huge_page_shift(h); 296 index += offset >> huge_page_shift(h);
297 offset &= ~huge_page_mask(h); 297 offset &= ~huge_page_mask(h);
298 298
299 /* short read or no more work */ 299 /* short read or no more work */
300 if ((ret != nr) || (len == 0)) 300 if ((ret != nr) || (len == 0))
301 break; 301 break;
302 } 302 }
303 out: 303 out:
304 *ppos = ((loff_t)index << huge_page_shift(h)) + offset; 304 *ppos = ((loff_t)index << huge_page_shift(h)) + offset;
305 return retval; 305 return retval;
306 } 306 }
307 307
308 static int hugetlbfs_write_begin(struct file *file, 308 static int hugetlbfs_write_begin(struct file *file,
309 struct address_space *mapping, 309 struct address_space *mapping,
310 loff_t pos, unsigned len, unsigned flags, 310 loff_t pos, unsigned len, unsigned flags,
311 struct page **pagep, void **fsdata) 311 struct page **pagep, void **fsdata)
312 { 312 {
313 return -EINVAL; 313 return -EINVAL;
314 } 314 }
315 315
316 static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, 316 static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
317 loff_t pos, unsigned len, unsigned copied, 317 loff_t pos, unsigned len, unsigned copied,
318 struct page *page, void *fsdata) 318 struct page *page, void *fsdata)
319 { 319 {
320 BUG(); 320 BUG();
321 return -EINVAL; 321 return -EINVAL;
322 } 322 }
323 323
324 static void truncate_huge_page(struct page *page) 324 static void truncate_huge_page(struct page *page)
325 { 325 {
326 cancel_dirty_page(page, /* No IO accounting for huge pages? */0); 326 cancel_dirty_page(page, /* No IO accounting for huge pages? */0);
327 ClearPageUptodate(page); 327 ClearPageUptodate(page);
328 delete_from_page_cache(page); 328 delete_from_page_cache(page);
329 } 329 }
330 330
331 static void truncate_hugepages(struct inode *inode, loff_t lstart) 331 static void truncate_hugepages(struct inode *inode, loff_t lstart)
332 { 332 {
333 struct hstate *h = hstate_inode(inode); 333 struct hstate *h = hstate_inode(inode);
334 struct address_space *mapping = &inode->i_data; 334 struct address_space *mapping = &inode->i_data;
335 const pgoff_t start = lstart >> huge_page_shift(h); 335 const pgoff_t start = lstart >> huge_page_shift(h);
336 struct pagevec pvec; 336 struct pagevec pvec;
337 pgoff_t next; 337 pgoff_t next;
338 int i, freed = 0; 338 int i, freed = 0;
339 339
340 pagevec_init(&pvec, 0); 340 pagevec_init(&pvec, 0);
341 next = start; 341 next = start;
342 while (1) { 342 while (1) {
343 if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 343 if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
344 if (next == start) 344 if (next == start)
345 break; 345 break;
346 next = start; 346 next = start;
347 continue; 347 continue;
348 } 348 }
349 349
350 for (i = 0; i < pagevec_count(&pvec); ++i) { 350 for (i = 0; i < pagevec_count(&pvec); ++i) {
351 struct page *page = pvec.pages[i]; 351 struct page *page = pvec.pages[i];
352 352
353 lock_page(page); 353 lock_page(page);
354 if (page->index > next) 354 if (page->index > next)
355 next = page->index; 355 next = page->index;
356 ++next; 356 ++next;
357 truncate_huge_page(page); 357 truncate_huge_page(page);
358 unlock_page(page); 358 unlock_page(page);
359 freed++; 359 freed++;
360 } 360 }
361 huge_pagevec_release(&pvec); 361 huge_pagevec_release(&pvec);
362 } 362 }
363 BUG_ON(!lstart && mapping->nrpages); 363 BUG_ON(!lstart && mapping->nrpages);
364 hugetlb_unreserve_pages(inode, start, freed); 364 hugetlb_unreserve_pages(inode, start, freed);
365 } 365 }
366 366
367 static void hugetlbfs_evict_inode(struct inode *inode) 367 static void hugetlbfs_evict_inode(struct inode *inode)
368 { 368 {
369 truncate_hugepages(inode, 0); 369 truncate_hugepages(inode, 0);
370 clear_inode(inode); 370 clear_inode(inode);
371 } 371 }
372 372
373 static inline void 373 static inline void
374 hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff) 374 hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
375 { 375 {
376 struct vm_area_struct *vma; 376 struct vm_area_struct *vma;
377 377
378 vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) { 378 vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) {
379 unsigned long v_offset; 379 unsigned long v_offset;
380 380
381 /* 381 /*
382 * Can the expression below overflow on 32-bit arches? 382 * Can the expression below overflow on 32-bit arches?
383 * No, because the interval tree returns us only those vmas 383 * No, because the interval tree returns us only those vmas
384 * which overlap the truncated area starting at pgoff, 384 * which overlap the truncated area starting at pgoff,
385 * and no vma on a 32-bit arch can span beyond the 4GB. 385 * and no vma on a 32-bit arch can span beyond the 4GB.
386 */ 386 */
387 if (vma->vm_pgoff < pgoff) 387 if (vma->vm_pgoff < pgoff)
388 v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT; 388 v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT;
389 else 389 else
390 v_offset = 0; 390 v_offset = 0;
391 391
392 unmap_hugepage_range(vma, vma->vm_start + v_offset, 392 unmap_hugepage_range(vma, vma->vm_start + v_offset,
393 vma->vm_end, NULL); 393 vma->vm_end, NULL);
394 } 394 }
395 } 395 }
396 396
397 static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) 397 static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
398 { 398 {
399 pgoff_t pgoff; 399 pgoff_t pgoff;
400 struct address_space *mapping = inode->i_mapping; 400 struct address_space *mapping = inode->i_mapping;
401 struct hstate *h = hstate_inode(inode); 401 struct hstate *h = hstate_inode(inode);
402 402
403 BUG_ON(offset & ~huge_page_mask(h)); 403 BUG_ON(offset & ~huge_page_mask(h));
404 pgoff = offset >> PAGE_SHIFT; 404 pgoff = offset >> PAGE_SHIFT;
405 405
406 i_size_write(inode, offset); 406 i_size_write(inode, offset);
407 mutex_lock(&mapping->i_mmap_mutex); 407 mutex_lock(&mapping->i_mmap_mutex);
408 if (!RB_EMPTY_ROOT(&mapping->i_mmap)) 408 if (!RB_EMPTY_ROOT(&mapping->i_mmap))
409 hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); 409 hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
410 mutex_unlock(&mapping->i_mmap_mutex); 410 mutex_unlock(&mapping->i_mmap_mutex);
411 truncate_hugepages(inode, offset); 411 truncate_hugepages(inode, offset);
412 return 0; 412 return 0;
413 } 413 }
414 414
415 static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) 415 static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
416 { 416 {
417 struct inode *inode = dentry->d_inode; 417 struct inode *inode = dentry->d_inode;
418 struct hstate *h = hstate_inode(inode); 418 struct hstate *h = hstate_inode(inode);
419 int error; 419 int error;
420 unsigned int ia_valid = attr->ia_valid; 420 unsigned int ia_valid = attr->ia_valid;
421 421
422 BUG_ON(!inode); 422 BUG_ON(!inode);
423 423
424 error = inode_change_ok(inode, attr); 424 error = inode_change_ok(inode, attr);
425 if (error) 425 if (error)
426 return error; 426 return error;
427 427
428 if (ia_valid & ATTR_SIZE) { 428 if (ia_valid & ATTR_SIZE) {
429 error = -EINVAL; 429 error = -EINVAL;
430 if (attr->ia_size & ~huge_page_mask(h)) 430 if (attr->ia_size & ~huge_page_mask(h))
431 return -EINVAL; 431 return -EINVAL;
432 error = hugetlb_vmtruncate(inode, attr->ia_size); 432 error = hugetlb_vmtruncate(inode, attr->ia_size);
433 if (error) 433 if (error)
434 return error; 434 return error;
435 } 435 }
436 436
437 setattr_copy(inode, attr); 437 setattr_copy(inode, attr);
438 mark_inode_dirty(inode); 438 mark_inode_dirty(inode);
439 return 0; 439 return 0;
440 } 440 }
441 441
442 static struct inode *hugetlbfs_get_root(struct super_block *sb, 442 static struct inode *hugetlbfs_get_root(struct super_block *sb,
443 struct hugetlbfs_config *config) 443 struct hugetlbfs_config *config)
444 { 444 {
445 struct inode *inode; 445 struct inode *inode;
446 446
447 inode = new_inode(sb); 447 inode = new_inode(sb);
448 if (inode) { 448 if (inode) {
449 struct hugetlbfs_inode_info *info; 449 struct hugetlbfs_inode_info *info;
450 inode->i_ino = get_next_ino(); 450 inode->i_ino = get_next_ino();
451 inode->i_mode = S_IFDIR | config->mode; 451 inode->i_mode = S_IFDIR | config->mode;
452 inode->i_uid = config->uid; 452 inode->i_uid = config->uid;
453 inode->i_gid = config->gid; 453 inode->i_gid = config->gid;
454 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 454 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
455 info = HUGETLBFS_I(inode); 455 info = HUGETLBFS_I(inode);
456 mpol_shared_policy_init(&info->policy, NULL); 456 mpol_shared_policy_init(&info->policy, NULL);
457 inode->i_op = &hugetlbfs_dir_inode_operations; 457 inode->i_op = &hugetlbfs_dir_inode_operations;
458 inode->i_fop = &simple_dir_operations; 458 inode->i_fop = &simple_dir_operations;
459 /* directory inodes start off with i_nlink == 2 (for "." entry) */ 459 /* directory inodes start off with i_nlink == 2 (for "." entry) */
460 inc_nlink(inode); 460 inc_nlink(inode);
461 lockdep_annotate_inode_mutex_key(inode); 461 lockdep_annotate_inode_mutex_key(inode);
462 } 462 }
463 return inode; 463 return inode;
464 } 464 }
465 465
466 static struct inode *hugetlbfs_get_inode(struct super_block *sb, 466 static struct inode *hugetlbfs_get_inode(struct super_block *sb,
467 struct inode *dir, 467 struct inode *dir,
468 umode_t mode, dev_t dev) 468 umode_t mode, dev_t dev)
469 { 469 {
470 struct inode *inode; 470 struct inode *inode;
471 471
472 inode = new_inode(sb); 472 inode = new_inode(sb);
473 if (inode) { 473 if (inode) {
474 struct hugetlbfs_inode_info *info; 474 struct hugetlbfs_inode_info *info;
475 inode->i_ino = get_next_ino(); 475 inode->i_ino = get_next_ino();
476 inode_init_owner(inode, dir, mode); 476 inode_init_owner(inode, dir, mode);
477 inode->i_mapping->a_ops = &hugetlbfs_aops; 477 inode->i_mapping->a_ops = &hugetlbfs_aops;
478 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; 478 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
479 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 479 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
480 INIT_LIST_HEAD(&inode->i_mapping->private_list); 480 INIT_LIST_HEAD(&inode->i_mapping->private_list);
481 info = HUGETLBFS_I(inode); 481 info = HUGETLBFS_I(inode);
482 /* 482 /*
483 * The policy is initialized here even if we are creating a 483 * The policy is initialized here even if we are creating a
484 * private inode because initialization simply creates an 484 * private inode because initialization simply creates an
485 * an empty rb tree and calls spin_lock_init(), later when we 485 * an empty rb tree and calls spin_lock_init(), later when we
486 * call mpol_free_shared_policy() it will just return because 486 * call mpol_free_shared_policy() it will just return because
487 * the rb tree will still be empty. 487 * the rb tree will still be empty.
488 */ 488 */
489 mpol_shared_policy_init(&info->policy, NULL); 489 mpol_shared_policy_init(&info->policy, NULL);
490 switch (mode & S_IFMT) { 490 switch (mode & S_IFMT) {
491 default: 491 default:
492 init_special_inode(inode, mode, dev); 492 init_special_inode(inode, mode, dev);
493 break; 493 break;
494 case S_IFREG: 494 case S_IFREG:
495 inode->i_op = &hugetlbfs_inode_operations; 495 inode->i_op = &hugetlbfs_inode_operations;
496 inode->i_fop = &hugetlbfs_file_operations; 496 inode->i_fop = &hugetlbfs_file_operations;
497 break; 497 break;
498 case S_IFDIR: 498 case S_IFDIR:
499 inode->i_op = &hugetlbfs_dir_inode_operations; 499 inode->i_op = &hugetlbfs_dir_inode_operations;
500 inode->i_fop = &simple_dir_operations; 500 inode->i_fop = &simple_dir_operations;
501 501
502 /* directory inodes start off with i_nlink == 2 (for "." entry) */ 502 /* directory inodes start off with i_nlink == 2 (for "." entry) */
503 inc_nlink(inode); 503 inc_nlink(inode);
504 break; 504 break;
505 case S_IFLNK: 505 case S_IFLNK:
506 inode->i_op = &page_symlink_inode_operations; 506 inode->i_op = &page_symlink_inode_operations;
507 break; 507 break;
508 } 508 }
509 lockdep_annotate_inode_mutex_key(inode); 509 lockdep_annotate_inode_mutex_key(inode);
510 } 510 }
511 return inode; 511 return inode;
512 } 512 }
513 513
514 /* 514 /*
515 * File creation. Allocate an inode, and we're done.. 515 * File creation. Allocate an inode, and we're done..
516 */ 516 */
517 static int hugetlbfs_mknod(struct inode *dir, 517 static int hugetlbfs_mknod(struct inode *dir,
518 struct dentry *dentry, umode_t mode, dev_t dev) 518 struct dentry *dentry, umode_t mode, dev_t dev)
519 { 519 {
520 struct inode *inode; 520 struct inode *inode;
521 int error = -ENOSPC; 521 int error = -ENOSPC;
522 522
523 inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); 523 inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
524 if (inode) { 524 if (inode) {
525 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 525 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
526 d_instantiate(dentry, inode); 526 d_instantiate(dentry, inode);
527 dget(dentry); /* Extra count - pin the dentry in core */ 527 dget(dentry); /* Extra count - pin the dentry in core */
528 error = 0; 528 error = 0;
529 } 529 }
530 return error; 530 return error;
531 } 531 }
532 532
533 static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 533 static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
534 { 534 {
535 int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0); 535 int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
536 if (!retval) 536 if (!retval)
537 inc_nlink(dir); 537 inc_nlink(dir);
538 return retval; 538 return retval;
539 } 539 }
540 540
541 static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) 541 static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
542 { 542 {
543 return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0); 543 return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
544 } 544 }
545 545
546 static int hugetlbfs_symlink(struct inode *dir, 546 static int hugetlbfs_symlink(struct inode *dir,
547 struct dentry *dentry, const char *symname) 547 struct dentry *dentry, const char *symname)
548 { 548 {
549 struct inode *inode; 549 struct inode *inode;
550 int error = -ENOSPC; 550 int error = -ENOSPC;
551 551
552 inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0); 552 inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
553 if (inode) { 553 if (inode) {
554 int l = strlen(symname)+1; 554 int l = strlen(symname)+1;
555 error = page_symlink(inode, symname, l); 555 error = page_symlink(inode, symname, l);
556 if (!error) { 556 if (!error) {
557 d_instantiate(dentry, inode); 557 d_instantiate(dentry, inode);
558 dget(dentry); 558 dget(dentry);
559 } else 559 } else
560 iput(inode); 560 iput(inode);
561 } 561 }
562 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 562 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
563 563
564 return error; 564 return error;
565 } 565 }
566 566
567 /* 567 /*
568 * mark the head page dirty 568 * mark the head page dirty
569 */ 569 */
570 static int hugetlbfs_set_page_dirty(struct page *page) 570 static int hugetlbfs_set_page_dirty(struct page *page)
571 { 571 {
572 struct page *head = compound_head(page); 572 struct page *head = compound_head(page);
573 573
574 SetPageDirty(head); 574 SetPageDirty(head);
575 return 0; 575 return 0;
576 } 576 }
577 577
578 static int hugetlbfs_migrate_page(struct address_space *mapping, 578 static int hugetlbfs_migrate_page(struct address_space *mapping,
579 struct page *newpage, struct page *page, 579 struct page *newpage, struct page *page,
580 enum migrate_mode mode) 580 enum migrate_mode mode)
581 { 581 {
582 int rc; 582 int rc;
583 583
584 rc = migrate_huge_page_move_mapping(mapping, newpage, page); 584 rc = migrate_huge_page_move_mapping(mapping, newpage, page);
585 if (rc != MIGRATEPAGE_SUCCESS) 585 if (rc != MIGRATEPAGE_SUCCESS)
586 return rc; 586 return rc;
587 migrate_page_copy(newpage, page); 587 migrate_page_copy(newpage, page);
588 588
589 return MIGRATEPAGE_SUCCESS; 589 return MIGRATEPAGE_SUCCESS;
590 } 590 }
591 591
592 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) 592 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
593 { 593 {
594 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); 594 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
595 struct hstate *h = hstate_inode(dentry->d_inode); 595 struct hstate *h = hstate_inode(dentry->d_inode);
596 596
597 buf->f_type = HUGETLBFS_MAGIC; 597 buf->f_type = HUGETLBFS_MAGIC;
598 buf->f_bsize = huge_page_size(h); 598 buf->f_bsize = huge_page_size(h);
599 if (sbinfo) { 599 if (sbinfo) {
600 spin_lock(&sbinfo->stat_lock); 600 spin_lock(&sbinfo->stat_lock);
601 /* If no limits set, just report 0 for max/free/used 601 /* If no limits set, just report 0 for max/free/used
602 * blocks, like simple_statfs() */ 602 * blocks, like simple_statfs() */
603 if (sbinfo->spool) { 603 if (sbinfo->spool) {
604 long free_pages; 604 long free_pages;
605 605
606 spin_lock(&sbinfo->spool->lock); 606 spin_lock(&sbinfo->spool->lock);
607 buf->f_blocks = sbinfo->spool->max_hpages; 607 buf->f_blocks = sbinfo->spool->max_hpages;
608 free_pages = sbinfo->spool->max_hpages 608 free_pages = sbinfo->spool->max_hpages
609 - sbinfo->spool->used_hpages; 609 - sbinfo->spool->used_hpages;
610 buf->f_bavail = buf->f_bfree = free_pages; 610 buf->f_bavail = buf->f_bfree = free_pages;
611 spin_unlock(&sbinfo->spool->lock); 611 spin_unlock(&sbinfo->spool->lock);
612 buf->f_files = sbinfo->max_inodes; 612 buf->f_files = sbinfo->max_inodes;
613 buf->f_ffree = sbinfo->free_inodes; 613 buf->f_ffree = sbinfo->free_inodes;
614 } 614 }
615 spin_unlock(&sbinfo->stat_lock); 615 spin_unlock(&sbinfo->stat_lock);
616 } 616 }
617 buf->f_namelen = NAME_MAX; 617 buf->f_namelen = NAME_MAX;
618 return 0; 618 return 0;
619 } 619 }
620 620
621 static void hugetlbfs_put_super(struct super_block *sb) 621 static void hugetlbfs_put_super(struct super_block *sb)
622 { 622 {
623 struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb); 623 struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb);
624 624
625 if (sbi) { 625 if (sbi) {
626 sb->s_fs_info = NULL; 626 sb->s_fs_info = NULL;
627 627
628 if (sbi->spool) 628 if (sbi->spool)
629 hugepage_put_subpool(sbi->spool); 629 hugepage_put_subpool(sbi->spool);
630 630
631 kfree(sbi); 631 kfree(sbi);
632 } 632 }
633 } 633 }
634 634
635 static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo) 635 static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
636 { 636 {
637 if (sbinfo->free_inodes >= 0) { 637 if (sbinfo->free_inodes >= 0) {
638 spin_lock(&sbinfo->stat_lock); 638 spin_lock(&sbinfo->stat_lock);
639 if (unlikely(!sbinfo->free_inodes)) { 639 if (unlikely(!sbinfo->free_inodes)) {
640 spin_unlock(&sbinfo->stat_lock); 640 spin_unlock(&sbinfo->stat_lock);
641 return 0; 641 return 0;
642 } 642 }
643 sbinfo->free_inodes--; 643 sbinfo->free_inodes--;
644 spin_unlock(&sbinfo->stat_lock); 644 spin_unlock(&sbinfo->stat_lock);
645 } 645 }
646 646
647 return 1; 647 return 1;
648 } 648 }
649 649
650 static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo) 650 static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
651 { 651 {
652 if (sbinfo->free_inodes >= 0) { 652 if (sbinfo->free_inodes >= 0) {
653 spin_lock(&sbinfo->stat_lock); 653 spin_lock(&sbinfo->stat_lock);
654 sbinfo->free_inodes++; 654 sbinfo->free_inodes++;
655 spin_unlock(&sbinfo->stat_lock); 655 spin_unlock(&sbinfo->stat_lock);
656 } 656 }
657 } 657 }
658 658
659 659
660 static struct kmem_cache *hugetlbfs_inode_cachep; 660 static struct kmem_cache *hugetlbfs_inode_cachep;
661 661
662 static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) 662 static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
663 { 663 {
664 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb); 664 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
665 struct hugetlbfs_inode_info *p; 665 struct hugetlbfs_inode_info *p;
666 666
667 if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo))) 667 if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
668 return NULL; 668 return NULL;
669 p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL); 669 p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL);
670 if (unlikely(!p)) { 670 if (unlikely(!p)) {
671 hugetlbfs_inc_free_inodes(sbinfo); 671 hugetlbfs_inc_free_inodes(sbinfo);
672 return NULL; 672 return NULL;
673 } 673 }
674 return &p->vfs_inode; 674 return &p->vfs_inode;
675 } 675 }
676 676
677 static void hugetlbfs_i_callback(struct rcu_head *head) 677 static void hugetlbfs_i_callback(struct rcu_head *head)
678 { 678 {
679 struct inode *inode = container_of(head, struct inode, i_rcu); 679 struct inode *inode = container_of(head, struct inode, i_rcu);
680 kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); 680 kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
681 } 681 }
682 682
683 static void hugetlbfs_destroy_inode(struct inode *inode) 683 static void hugetlbfs_destroy_inode(struct inode *inode)
684 { 684 {
685 hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); 685 hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
686 mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); 686 mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
687 call_rcu(&inode->i_rcu, hugetlbfs_i_callback); 687 call_rcu(&inode->i_rcu, hugetlbfs_i_callback);
688 } 688 }
689 689
690 static const struct address_space_operations hugetlbfs_aops = { 690 static const struct address_space_operations hugetlbfs_aops = {
691 .write_begin = hugetlbfs_write_begin, 691 .write_begin = hugetlbfs_write_begin,
692 .write_end = hugetlbfs_write_end, 692 .write_end = hugetlbfs_write_end,
693 .set_page_dirty = hugetlbfs_set_page_dirty, 693 .set_page_dirty = hugetlbfs_set_page_dirty,
694 .migratepage = hugetlbfs_migrate_page, 694 .migratepage = hugetlbfs_migrate_page,
695 }; 695 };
696 696
697 697
698 static void init_once(void *foo) 698 static void init_once(void *foo)
699 { 699 {
700 struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; 700 struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
701 701
702 inode_init_once(&ei->vfs_inode); 702 inode_init_once(&ei->vfs_inode);
703 } 703 }
704 704
705 const struct file_operations hugetlbfs_file_operations = { 705 const struct file_operations hugetlbfs_file_operations = {
706 .read = hugetlbfs_read, 706 .read = hugetlbfs_read,
707 .mmap = hugetlbfs_file_mmap, 707 .mmap = hugetlbfs_file_mmap,
708 .fsync = noop_fsync, 708 .fsync = noop_fsync,
709 .get_unmapped_area = hugetlb_get_unmapped_area, 709 .get_unmapped_area = hugetlb_get_unmapped_area,
710 .llseek = default_llseek, 710 .llseek = default_llseek,
711 }; 711 };
712 712
713 static const struct inode_operations hugetlbfs_dir_inode_operations = { 713 static const struct inode_operations hugetlbfs_dir_inode_operations = {
714 .create = hugetlbfs_create, 714 .create = hugetlbfs_create,
715 .lookup = simple_lookup, 715 .lookup = simple_lookup,
716 .link = simple_link, 716 .link = simple_link,
717 .unlink = simple_unlink, 717 .unlink = simple_unlink,
718 .symlink = hugetlbfs_symlink, 718 .symlink = hugetlbfs_symlink,
719 .mkdir = hugetlbfs_mkdir, 719 .mkdir = hugetlbfs_mkdir,
720 .rmdir = simple_rmdir, 720 .rmdir = simple_rmdir,
721 .mknod = hugetlbfs_mknod, 721 .mknod = hugetlbfs_mknod,
722 .rename = simple_rename, 722 .rename = simple_rename,
723 .setattr = hugetlbfs_setattr, 723 .setattr = hugetlbfs_setattr,
724 }; 724 };
725 725
726 static const struct inode_operations hugetlbfs_inode_operations = { 726 static const struct inode_operations hugetlbfs_inode_operations = {
727 .setattr = hugetlbfs_setattr, 727 .setattr = hugetlbfs_setattr,
728 }; 728 };
729 729
730 static const struct super_operations hugetlbfs_ops = { 730 static const struct super_operations hugetlbfs_ops = {
731 .alloc_inode = hugetlbfs_alloc_inode, 731 .alloc_inode = hugetlbfs_alloc_inode,
732 .destroy_inode = hugetlbfs_destroy_inode, 732 .destroy_inode = hugetlbfs_destroy_inode,
733 .evict_inode = hugetlbfs_evict_inode, 733 .evict_inode = hugetlbfs_evict_inode,
734 .statfs = hugetlbfs_statfs, 734 .statfs = hugetlbfs_statfs,
735 .put_super = hugetlbfs_put_super, 735 .put_super = hugetlbfs_put_super,
736 .show_options = generic_show_options, 736 .show_options = generic_show_options,
737 }; 737 };
738 738
739 static int 739 static int
740 hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) 740 hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
741 { 741 {
742 char *p, *rest; 742 char *p, *rest;
743 substring_t args[MAX_OPT_ARGS]; 743 substring_t args[MAX_OPT_ARGS];
744 int option; 744 int option;
745 unsigned long long size = 0; 745 unsigned long long size = 0;
746 enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE; 746 enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE;
747 747
748 if (!options) 748 if (!options)
749 return 0; 749 return 0;
750 750
751 while ((p = strsep(&options, ",")) != NULL) { 751 while ((p = strsep(&options, ",")) != NULL) {
752 int token; 752 int token;
753 if (!*p) 753 if (!*p)
754 continue; 754 continue;
755 755
756 token = match_token(p, tokens, args); 756 token = match_token(p, tokens, args);
757 switch (token) { 757 switch (token) {
758 case Opt_uid: 758 case Opt_uid:
759 if (match_int(&args[0], &option)) 759 if (match_int(&args[0], &option))
760 goto bad_val; 760 goto bad_val;
761 pconfig->uid = make_kuid(current_user_ns(), option); 761 pconfig->uid = make_kuid(current_user_ns(), option);
762 if (!uid_valid(pconfig->uid)) 762 if (!uid_valid(pconfig->uid))
763 goto bad_val; 763 goto bad_val;
764 break; 764 break;
765 765
766 case Opt_gid: 766 case Opt_gid:
767 if (match_int(&args[0], &option)) 767 if (match_int(&args[0], &option))
768 goto bad_val; 768 goto bad_val;
769 pconfig->gid = make_kgid(current_user_ns(), option); 769 pconfig->gid = make_kgid(current_user_ns(), option);
770 if (!gid_valid(pconfig->gid)) 770 if (!gid_valid(pconfig->gid))
771 goto bad_val; 771 goto bad_val;
772 break; 772 break;
773 773
774 case Opt_mode: 774 case Opt_mode:
775 if (match_octal(&args[0], &option)) 775 if (match_octal(&args[0], &option))
776 goto bad_val; 776 goto bad_val;
777 pconfig->mode = option & 01777U; 777 pconfig->mode = option & 01777U;
778 break; 778 break;
779 779
780 case Opt_size: { 780 case Opt_size: {
781 /* memparse() will accept a K/M/G without a digit */ 781 /* memparse() will accept a K/M/G without a digit */
782 if (!isdigit(*args[0].from)) 782 if (!isdigit(*args[0].from))
783 goto bad_val; 783 goto bad_val;
784 size = memparse(args[0].from, &rest); 784 size = memparse(args[0].from, &rest);
785 setsize = SIZE_STD; 785 setsize = SIZE_STD;
786 if (*rest == '%') 786 if (*rest == '%')
787 setsize = SIZE_PERCENT; 787 setsize = SIZE_PERCENT;
788 break; 788 break;
789 } 789 }
790 790
791 case Opt_nr_inodes: 791 case Opt_nr_inodes:
792 /* memparse() will accept a K/M/G without a digit */ 792 /* memparse() will accept a K/M/G without a digit */
793 if (!isdigit(*args[0].from)) 793 if (!isdigit(*args[0].from))
794 goto bad_val; 794 goto bad_val;
795 pconfig->nr_inodes = memparse(args[0].from, &rest); 795 pconfig->nr_inodes = memparse(args[0].from, &rest);
796 break; 796 break;
797 797
798 case Opt_pagesize: { 798 case Opt_pagesize: {
799 unsigned long ps; 799 unsigned long ps;
800 ps = memparse(args[0].from, &rest); 800 ps = memparse(args[0].from, &rest);
801 pconfig->hstate = size_to_hstate(ps); 801 pconfig->hstate = size_to_hstate(ps);
802 if (!pconfig->hstate) { 802 if (!pconfig->hstate) {
803 printk(KERN_ERR 803 printk(KERN_ERR
804 "hugetlbfs: Unsupported page size %lu MB\n", 804 "hugetlbfs: Unsupported page size %lu MB\n",
805 ps >> 20); 805 ps >> 20);
806 return -EINVAL; 806 return -EINVAL;
807 } 807 }
808 break; 808 break;
809 } 809 }
810 810
811 default: 811 default:
812 printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n", 812 printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n",
813 p); 813 p);
814 return -EINVAL; 814 return -EINVAL;
815 break; 815 break;
816 } 816 }
817 } 817 }
818 818
819 /* Do size after hstate is set up */ 819 /* Do size after hstate is set up */
820 if (setsize > NO_SIZE) { 820 if (setsize > NO_SIZE) {
821 struct hstate *h = pconfig->hstate; 821 struct hstate *h = pconfig->hstate;
822 if (setsize == SIZE_PERCENT) { 822 if (setsize == SIZE_PERCENT) {
823 size <<= huge_page_shift(h); 823 size <<= huge_page_shift(h);
824 size *= h->max_huge_pages; 824 size *= h->max_huge_pages;
825 do_div(size, 100); 825 do_div(size, 100);
826 } 826 }
827 pconfig->nr_blocks = (size >> huge_page_shift(h)); 827 pconfig->nr_blocks = (size >> huge_page_shift(h));
828 } 828 }
829 829
830 return 0; 830 return 0;
831 831
832 bad_val: 832 bad_val:
833 printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n", 833 printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n",
834 args[0].from, p); 834 args[0].from, p);
835 return -EINVAL; 835 return -EINVAL;
836 } 836 }
837 837
838 static int 838 static int
839 hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) 839 hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
840 { 840 {
841 int ret; 841 int ret;
842 struct hugetlbfs_config config; 842 struct hugetlbfs_config config;
843 struct hugetlbfs_sb_info *sbinfo; 843 struct hugetlbfs_sb_info *sbinfo;
844 844
845 save_mount_options(sb, data); 845 save_mount_options(sb, data);
846 846
847 config.nr_blocks = -1; /* No limit on size by default */ 847 config.nr_blocks = -1; /* No limit on size by default */
848 config.nr_inodes = -1; /* No limit on number of inodes by default */ 848 config.nr_inodes = -1; /* No limit on number of inodes by default */
849 config.uid = current_fsuid(); 849 config.uid = current_fsuid();
850 config.gid = current_fsgid(); 850 config.gid = current_fsgid();
851 config.mode = 0755; 851 config.mode = 0755;
852 config.hstate = &default_hstate; 852 config.hstate = &default_hstate;
853 ret = hugetlbfs_parse_options(data, &config); 853 ret = hugetlbfs_parse_options(data, &config);
854 if (ret) 854 if (ret)
855 return ret; 855 return ret;
856 856
857 sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL); 857 sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
858 if (!sbinfo) 858 if (!sbinfo)
859 return -ENOMEM; 859 return -ENOMEM;
860 sb->s_fs_info = sbinfo; 860 sb->s_fs_info = sbinfo;
861 sbinfo->hstate = config.hstate; 861 sbinfo->hstate = config.hstate;
862 spin_lock_init(&sbinfo->stat_lock); 862 spin_lock_init(&sbinfo->stat_lock);
863 sbinfo->max_inodes = config.nr_inodes; 863 sbinfo->max_inodes = config.nr_inodes;
864 sbinfo->free_inodes = config.nr_inodes; 864 sbinfo->free_inodes = config.nr_inodes;
865 sbinfo->spool = NULL; 865 sbinfo->spool = NULL;
866 if (config.nr_blocks != -1) { 866 if (config.nr_blocks != -1) {
867 sbinfo->spool = hugepage_new_subpool(config.nr_blocks); 867 sbinfo->spool = hugepage_new_subpool(config.nr_blocks);
868 if (!sbinfo->spool) 868 if (!sbinfo->spool)
869 goto out_free; 869 goto out_free;
870 } 870 }
871 sb->s_maxbytes = MAX_LFS_FILESIZE; 871 sb->s_maxbytes = MAX_LFS_FILESIZE;
872 sb->s_blocksize = huge_page_size(config.hstate); 872 sb->s_blocksize = huge_page_size(config.hstate);
873 sb->s_blocksize_bits = huge_page_shift(config.hstate); 873 sb->s_blocksize_bits = huge_page_shift(config.hstate);
874 sb->s_magic = HUGETLBFS_MAGIC; 874 sb->s_magic = HUGETLBFS_MAGIC;
875 sb->s_op = &hugetlbfs_ops; 875 sb->s_op = &hugetlbfs_ops;
876 sb->s_time_gran = 1; 876 sb->s_time_gran = 1;
877 sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config)); 877 sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config));
878 if (!sb->s_root) 878 if (!sb->s_root)
879 goto out_free; 879 goto out_free;
880 return 0; 880 return 0;
881 out_free: 881 out_free:
882 if (sbinfo->spool) 882 if (sbinfo->spool)
883 kfree(sbinfo->spool); 883 kfree(sbinfo->spool);
884 kfree(sbinfo); 884 kfree(sbinfo);
885 return -ENOMEM; 885 return -ENOMEM;
886 } 886 }
887 887
888 static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type, 888 static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
889 int flags, const char *dev_name, void *data) 889 int flags, const char *dev_name, void *data)
890 { 890 {
891 return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super); 891 return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
892 } 892 }
893 893
894 static struct file_system_type hugetlbfs_fs_type = { 894 static struct file_system_type hugetlbfs_fs_type = {
895 .name = "hugetlbfs", 895 .name = "hugetlbfs",
896 .mount = hugetlbfs_mount, 896 .mount = hugetlbfs_mount,
897 .kill_sb = kill_litter_super, 897 .kill_sb = kill_litter_super,
898 }; 898 };
899 MODULE_ALIAS_FS("hugetlbfs"); 899 MODULE_ALIAS_FS("hugetlbfs");
900 900
901 static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; 901 static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
902 902
903 static int can_do_hugetlb_shm(void) 903 static int can_do_hugetlb_shm(void)
904 { 904 {
905 kgid_t shm_group; 905 kgid_t shm_group;
906 shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group); 906 shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group);
907 return capable(CAP_IPC_LOCK) || in_group_p(shm_group); 907 return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
908 } 908 }
909 909
910 static int get_hstate_idx(int page_size_log) 910 static int get_hstate_idx(int page_size_log)
911 { 911 {
912 struct hstate *h; 912 struct hstate *h = hstate_sizelog(page_size_log);
913 913
914 if (!page_size_log)
915 return default_hstate_idx;
916 h = size_to_hstate(1 << page_size_log);
917 if (!h) 914 if (!h)
918 return -1; 915 return -1;
919 return h - hstates; 916 return h - hstates;
920 } 917 }
921 918
922 static char *hugetlb_dname(struct dentry *dentry, char *buffer, int buflen) 919 static char *hugetlb_dname(struct dentry *dentry, char *buffer, int buflen)
923 { 920 {
924 return dynamic_dname(dentry, buffer, buflen, "/%s (deleted)", 921 return dynamic_dname(dentry, buffer, buflen, "/%s (deleted)",
925 dentry->d_name.name); 922 dentry->d_name.name);
926 } 923 }
927 924
928 static struct dentry_operations anon_ops = { 925 static struct dentry_operations anon_ops = {
929 .d_dname = hugetlb_dname 926 .d_dname = hugetlb_dname
930 }; 927 };
931 928
932 struct file *hugetlb_file_setup(const char *name, unsigned long addr, 929 /*
933 size_t size, vm_flags_t acctflag, 930 * Note that size should be aligned to proper hugepage size in caller side,
934 struct user_struct **user, 931 * otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
932 */
933 struct file *hugetlb_file_setup(const char *name, size_t size,
934 vm_flags_t acctflag, struct user_struct **user,
935 int creat_flags, int page_size_log) 935 int creat_flags, int page_size_log)
936 { 936 {
937 struct file *file = ERR_PTR(-ENOMEM); 937 struct file *file = ERR_PTR(-ENOMEM);
938 struct inode *inode; 938 struct inode *inode;
939 struct path path; 939 struct path path;
940 struct super_block *sb; 940 struct super_block *sb;
941 struct qstr quick_string; 941 struct qstr quick_string;
942 struct hstate *hstate;
943 unsigned long num_pages;
944 int hstate_idx; 942 int hstate_idx;
945 943
946 hstate_idx = get_hstate_idx(page_size_log); 944 hstate_idx = get_hstate_idx(page_size_log);
947 if (hstate_idx < 0) 945 if (hstate_idx < 0)
948 return ERR_PTR(-ENODEV); 946 return ERR_PTR(-ENODEV);
949 947
950 *user = NULL; 948 *user = NULL;
951 if (!hugetlbfs_vfsmount[hstate_idx]) 949 if (!hugetlbfs_vfsmount[hstate_idx])
952 return ERR_PTR(-ENOENT); 950 return ERR_PTR(-ENOENT);
953 951
954 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { 952 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
955 *user = current_user(); 953 *user = current_user();
956 if (user_shm_lock(size, *user)) { 954 if (user_shm_lock(size, *user)) {
957 task_lock(current); 955 task_lock(current);
958 printk_once(KERN_WARNING 956 printk_once(KERN_WARNING
959 "%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", 957 "%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
960 current->comm, current->pid); 958 current->comm, current->pid);
961 task_unlock(current); 959 task_unlock(current);
962 } else { 960 } else {
963 *user = NULL; 961 *user = NULL;
964 return ERR_PTR(-EPERM); 962 return ERR_PTR(-EPERM);
965 } 963 }
966 } 964 }
967 965
968 sb = hugetlbfs_vfsmount[hstate_idx]->mnt_sb; 966 sb = hugetlbfs_vfsmount[hstate_idx]->mnt_sb;
969 quick_string.name = name; 967 quick_string.name = name;
970 quick_string.len = strlen(quick_string.name); 968 quick_string.len = strlen(quick_string.name);
971 quick_string.hash = 0; 969 quick_string.hash = 0;
972 path.dentry = d_alloc_pseudo(sb, &quick_string); 970 path.dentry = d_alloc_pseudo(sb, &quick_string);
973 if (!path.dentry) 971 if (!path.dentry)
974 goto out_shm_unlock; 972 goto out_shm_unlock;
975 973
976 d_set_d_op(path.dentry, &anon_ops); 974 d_set_d_op(path.dentry, &anon_ops);
977 path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]); 975 path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);
978 file = ERR_PTR(-ENOSPC); 976 file = ERR_PTR(-ENOSPC);
979 inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0); 977 inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0);
980 if (!inode) 978 if (!inode)
981 goto out_dentry; 979 goto out_dentry;
982 980
983 hstate = hstate_inode(inode);
984 size += addr & ~huge_page_mask(hstate);
985 num_pages = ALIGN(size, huge_page_size(hstate)) >>
986 huge_page_shift(hstate);
987 file = ERR_PTR(-ENOMEM); 981 file = ERR_PTR(-ENOMEM);
988 if (hugetlb_reserve_pages(inode, 0, num_pages, NULL, acctflag)) 982 if (hugetlb_reserve_pages(inode, 0,
983 size >> huge_page_shift(hstate_inode(inode)), NULL,
984 acctflag))
989 goto out_inode; 985 goto out_inode;
990 986
991 d_instantiate(path.dentry, inode); 987 d_instantiate(path.dentry, inode);
992 inode->i_size = size; 988 inode->i_size = size;
993 clear_nlink(inode); 989 clear_nlink(inode);
994 990
995 file = alloc_file(&path, FMODE_WRITE | FMODE_READ, 991 file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
996 &hugetlbfs_file_operations); 992 &hugetlbfs_file_operations);
997 if (IS_ERR(file)) 993 if (IS_ERR(file))
998 goto out_dentry; /* inode is already attached */ 994 goto out_dentry; /* inode is already attached */
999 995
1000 return file; 996 return file;
1001 997
1002 out_inode: 998 out_inode:
1003 iput(inode); 999 iput(inode);
1004 out_dentry: 1000 out_dentry:
1005 path_put(&path); 1001 path_put(&path);
1006 out_shm_unlock: 1002 out_shm_unlock:
1007 if (*user) { 1003 if (*user) {
1008 user_shm_unlock(size, *user); 1004 user_shm_unlock(size, *user);
1009 *user = NULL; 1005 *user = NULL;
1010 } 1006 }
1011 return file; 1007 return file;
1012 } 1008 }
1013 1009
1014 static int __init init_hugetlbfs_fs(void) 1010 static int __init init_hugetlbfs_fs(void)
1015 { 1011 {
1016 struct hstate *h; 1012 struct hstate *h;
1017 int error; 1013 int error;
1018 int i; 1014 int i;
1019 1015
1020 error = bdi_init(&hugetlbfs_backing_dev_info); 1016 error = bdi_init(&hugetlbfs_backing_dev_info);
1021 if (error) 1017 if (error)
1022 return error; 1018 return error;
1023 1019
1024 error = -ENOMEM; 1020 error = -ENOMEM;
1025 hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", 1021 hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
1026 sizeof(struct hugetlbfs_inode_info), 1022 sizeof(struct hugetlbfs_inode_info),
1027 0, 0, init_once); 1023 0, 0, init_once);
1028 if (hugetlbfs_inode_cachep == NULL) 1024 if (hugetlbfs_inode_cachep == NULL)
1029 goto out2; 1025 goto out2;
1030 1026
1031 error = register_filesystem(&hugetlbfs_fs_type); 1027 error = register_filesystem(&hugetlbfs_fs_type);
1032 if (error) 1028 if (error)
1033 goto out; 1029 goto out;
1034 1030
1035 i = 0; 1031 i = 0;
1036 for_each_hstate(h) { 1032 for_each_hstate(h) {
1037 char buf[50]; 1033 char buf[50];
1038 unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10); 1034 unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);
1039 1035
1040 snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb); 1036 snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
1041 hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type, 1037 hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
1042 buf); 1038 buf);
1043 1039
1044 if (IS_ERR(hugetlbfs_vfsmount[i])) { 1040 if (IS_ERR(hugetlbfs_vfsmount[i])) {
1045 pr_err("hugetlb: Cannot mount internal hugetlbfs for " 1041 pr_err("hugetlb: Cannot mount internal hugetlbfs for "
1046 "page size %uK", ps_kb); 1042 "page size %uK", ps_kb);
1047 error = PTR_ERR(hugetlbfs_vfsmount[i]); 1043 error = PTR_ERR(hugetlbfs_vfsmount[i]);
1048 hugetlbfs_vfsmount[i] = NULL; 1044 hugetlbfs_vfsmount[i] = NULL;
1049 } 1045 }
1050 i++; 1046 i++;
1051 } 1047 }
1052 /* Non default hstates are optional */ 1048 /* Non default hstates are optional */
1053 if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx])) 1049 if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
1054 return 0; 1050 return 0;
1055 1051
1056 out: 1052 out:
1057 kmem_cache_destroy(hugetlbfs_inode_cachep); 1053 kmem_cache_destroy(hugetlbfs_inode_cachep);
1058 out2: 1054 out2:
1059 bdi_destroy(&hugetlbfs_backing_dev_info); 1055 bdi_destroy(&hugetlbfs_backing_dev_info);
1060 return error; 1056 return error;
1061 } 1057 }
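init_hugetlbfs_fs() creates one internal hugetlbfs mount per registered hstate and passes the page size through the pagesize= option it formats above; only the default hstate's mount is required for the function to succeed. For comparison, an explicit userspace mount of the same filesystem is sketched below; the /mnt/huge path and the 2 MB page size are illustrative assumptions, and the call needs root.

	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* Same option format the kernel builds above: "pagesize=%uK". */
		if (mount("none", "/mnt/huge", "hugetlbfs", 0, "pagesize=2048K")) {
			perror("mount hugetlbfs");
			return 1;
		}
		return 0;
	}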
1062 1058
1063 static void __exit exit_hugetlbfs_fs(void) 1059 static void __exit exit_hugetlbfs_fs(void)
1064 { 1060 {
1065 struct hstate *h; 1061 struct hstate *h;
1066 int i; 1062 int i;
1067 1063
1068 1064
1069 /* 1065 /*
1070 * Make sure all delayed rcu free inodes are flushed before we 1066 * Make sure all delayed rcu free inodes are flushed before we
1071 * destroy cache. 1067 * destroy cache.
1072 */ 1068 */
1073 rcu_barrier(); 1069 rcu_barrier();
1074 kmem_cache_destroy(hugetlbfs_inode_cachep); 1070 kmem_cache_destroy(hugetlbfs_inode_cachep);
1075 i = 0; 1071 i = 0;
1076 for_each_hstate(h) 1072 for_each_hstate(h)
1077 kern_unmount(hugetlbfs_vfsmount[i++]); 1073 kern_unmount(hugetlbfs_vfsmount[i++]);
1078 unregister_filesystem(&hugetlbfs_fs_type); 1074 unregister_filesystem(&hugetlbfs_fs_type);
1079 bdi_destroy(&hugetlbfs_backing_dev_info); 1075 bdi_destroy(&hugetlbfs_backing_dev_info);
1080 } 1076 }
1081 1077
include/linux/hugetlb.h
1 #ifndef _LINUX_HUGETLB_H 1 #ifndef _LINUX_HUGETLB_H
2 #define _LINUX_HUGETLB_H 2 #define _LINUX_HUGETLB_H
3 3
4 #include <linux/mm_types.h> 4 #include <linux/mm_types.h>
5 #include <linux/fs.h> 5 #include <linux/fs.h>
6 #include <linux/hugetlb_inline.h> 6 #include <linux/hugetlb_inline.h>
7 #include <linux/cgroup.h> 7 #include <linux/cgroup.h>
8 8
9 struct ctl_table; 9 struct ctl_table;
10 struct user_struct; 10 struct user_struct;
11 struct mmu_gather; 11 struct mmu_gather;
12 12
13 #ifdef CONFIG_HUGETLB_PAGE 13 #ifdef CONFIG_HUGETLB_PAGE
14 14
15 #include <linux/mempolicy.h> 15 #include <linux/mempolicy.h>
16 #include <linux/shm.h> 16 #include <linux/shm.h>
17 #include <asm/tlbflush.h> 17 #include <asm/tlbflush.h>
18 18
19 struct hugepage_subpool { 19 struct hugepage_subpool {
20 spinlock_t lock; 20 spinlock_t lock;
21 long count; 21 long count;
22 long max_hpages, used_hpages; 22 long max_hpages, used_hpages;
23 }; 23 };
24 24
25 extern spinlock_t hugetlb_lock; 25 extern spinlock_t hugetlb_lock;
26 extern int hugetlb_max_hstate __read_mostly; 26 extern int hugetlb_max_hstate __read_mostly;
27 #define for_each_hstate(h) \ 27 #define for_each_hstate(h) \
28 for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++) 28 for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++)
29 29
30 struct hugepage_subpool *hugepage_new_subpool(long nr_blocks); 30 struct hugepage_subpool *hugepage_new_subpool(long nr_blocks);
31 void hugepage_put_subpool(struct hugepage_subpool *spool); 31 void hugepage_put_subpool(struct hugepage_subpool *spool);
32 32
33 int PageHuge(struct page *page); 33 int PageHuge(struct page *page);
34 34
35 void reset_vma_resv_huge_pages(struct vm_area_struct *vma); 35 void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
36 int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); 36 int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
37 int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); 37 int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
38 int hugetlb_treat_movable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); 38 int hugetlb_treat_movable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
39 39
40 #ifdef CONFIG_NUMA 40 #ifdef CONFIG_NUMA
41 int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, 41 int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int,
42 void __user *, size_t *, loff_t *); 42 void __user *, size_t *, loff_t *);
43 #endif 43 #endif
44 44
45 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); 45 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
46 long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, 46 long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
47 struct page **, struct vm_area_struct **, 47 struct page **, struct vm_area_struct **,
48 unsigned long *, unsigned long *, long, unsigned int); 48 unsigned long *, unsigned long *, long, unsigned int);
49 void unmap_hugepage_range(struct vm_area_struct *, 49 void unmap_hugepage_range(struct vm_area_struct *,
50 unsigned long, unsigned long, struct page *); 50 unsigned long, unsigned long, struct page *);
51 void __unmap_hugepage_range_final(struct mmu_gather *tlb, 51 void __unmap_hugepage_range_final(struct mmu_gather *tlb,
52 struct vm_area_struct *vma, 52 struct vm_area_struct *vma,
53 unsigned long start, unsigned long end, 53 unsigned long start, unsigned long end,
54 struct page *ref_page); 54 struct page *ref_page);
55 void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, 55 void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
56 unsigned long start, unsigned long end, 56 unsigned long start, unsigned long end,
57 struct page *ref_page); 57 struct page *ref_page);
58 int hugetlb_prefault(struct address_space *, struct vm_area_struct *); 58 int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
59 void hugetlb_report_meminfo(struct seq_file *); 59 void hugetlb_report_meminfo(struct seq_file *);
60 int hugetlb_report_node_meminfo(int, char *); 60 int hugetlb_report_node_meminfo(int, char *);
61 void hugetlb_show_meminfo(void); 61 void hugetlb_show_meminfo(void);
62 unsigned long hugetlb_total_pages(void); 62 unsigned long hugetlb_total_pages(void);
63 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 63 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
64 unsigned long address, unsigned int flags); 64 unsigned long address, unsigned int flags);
65 int hugetlb_reserve_pages(struct inode *inode, long from, long to, 65 int hugetlb_reserve_pages(struct inode *inode, long from, long to,
66 struct vm_area_struct *vma, 66 struct vm_area_struct *vma,
67 vm_flags_t vm_flags); 67 vm_flags_t vm_flags);
68 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); 68 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
69 int dequeue_hwpoisoned_huge_page(struct page *page); 69 int dequeue_hwpoisoned_huge_page(struct page *page);
70 void copy_huge_page(struct page *dst, struct page *src); 70 void copy_huge_page(struct page *dst, struct page *src);
71 71
72 extern unsigned long hugepages_treat_as_movable; 72 extern unsigned long hugepages_treat_as_movable;
73 extern const unsigned long hugetlb_zero, hugetlb_infinity; 73 extern const unsigned long hugetlb_zero, hugetlb_infinity;
74 extern int sysctl_hugetlb_shm_group; 74 extern int sysctl_hugetlb_shm_group;
75 extern struct list_head huge_boot_pages; 75 extern struct list_head huge_boot_pages;
76 76
77 /* arch callbacks */ 77 /* arch callbacks */
78 78
79 pte_t *huge_pte_alloc(struct mm_struct *mm, 79 pte_t *huge_pte_alloc(struct mm_struct *mm,
80 unsigned long addr, unsigned long sz); 80 unsigned long addr, unsigned long sz);
81 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr); 81 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr);
82 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep); 82 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep);
83 struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, 83 struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
84 int write); 84 int write);
85 struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, 85 struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
86 pmd_t *pmd, int write); 86 pmd_t *pmd, int write);
87 struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, 87 struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
88 pud_t *pud, int write); 88 pud_t *pud, int write);
89 int pmd_huge(pmd_t pmd); 89 int pmd_huge(pmd_t pmd);
90 int pud_huge(pud_t pmd); 90 int pud_huge(pud_t pmd);
91 unsigned long hugetlb_change_protection(struct vm_area_struct *vma, 91 unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
92 unsigned long address, unsigned long end, pgprot_t newprot); 92 unsigned long address, unsigned long end, pgprot_t newprot);
93 93
94 #else /* !CONFIG_HUGETLB_PAGE */ 94 #else /* !CONFIG_HUGETLB_PAGE */
95 95
96 static inline int PageHuge(struct page *page) 96 static inline int PageHuge(struct page *page)
97 { 97 {
98 return 0; 98 return 0;
99 } 99 }
100 100
101 static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) 101 static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
102 { 102 {
103 } 103 }
104 104
105 static inline unsigned long hugetlb_total_pages(void) 105 static inline unsigned long hugetlb_total_pages(void)
106 { 106 {
107 return 0; 107 return 0;
108 } 108 }
109 109
110 #define follow_hugetlb_page(m,v,p,vs,a,b,i,w) ({ BUG(); 0; }) 110 #define follow_hugetlb_page(m,v,p,vs,a,b,i,w) ({ BUG(); 0; })
111 #define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL) 111 #define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL)
112 #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) 112 #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
113 #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) 113 #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; })
114 static inline void hugetlb_report_meminfo(struct seq_file *m) 114 static inline void hugetlb_report_meminfo(struct seq_file *m)
115 { 115 {
116 } 116 }
117 #define hugetlb_report_node_meminfo(n, buf) 0 117 #define hugetlb_report_node_meminfo(n, buf) 0
118 static inline void hugetlb_show_meminfo(void) 118 static inline void hugetlb_show_meminfo(void)
119 { 119 {
120 } 120 }
121 #define follow_huge_pmd(mm, addr, pmd, write) NULL 121 #define follow_huge_pmd(mm, addr, pmd, write) NULL
122 #define follow_huge_pud(mm, addr, pud, write) NULL 122 #define follow_huge_pud(mm, addr, pud, write) NULL
123 #define prepare_hugepage_range(file, addr, len) (-EINVAL) 123 #define prepare_hugepage_range(file, addr, len) (-EINVAL)
124 #define pmd_huge(x) 0 124 #define pmd_huge(x) 0
125 #define pud_huge(x) 0 125 #define pud_huge(x) 0
126 #define is_hugepage_only_range(mm, addr, len) 0 126 #define is_hugepage_only_range(mm, addr, len) 0
127 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) 127 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
128 #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) 128 #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; })
129 #define huge_pte_offset(mm, address) 0 129 #define huge_pte_offset(mm, address) 0
130 static inline int dequeue_hwpoisoned_huge_page(struct page *page) 130 static inline int dequeue_hwpoisoned_huge_page(struct page *page)
131 { 131 {
132 return 0; 132 return 0;
133 } 133 }
134 134
135 static inline void copy_huge_page(struct page *dst, struct page *src) 135 static inline void copy_huge_page(struct page *dst, struct page *src)
136 { 136 {
137 } 137 }
138 138
139 static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma, 139 static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
140 unsigned long address, unsigned long end, pgprot_t newprot) 140 unsigned long address, unsigned long end, pgprot_t newprot)
141 { 141 {
142 return 0; 142 return 0;
143 } 143 }
144 144
145 static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb, 145 static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb,
146 struct vm_area_struct *vma, unsigned long start, 146 struct vm_area_struct *vma, unsigned long start,
147 unsigned long end, struct page *ref_page) 147 unsigned long end, struct page *ref_page)
148 { 148 {
149 BUG(); 149 BUG();
150 } 150 }
151 151
152 static inline void __unmap_hugepage_range(struct mmu_gather *tlb, 152 static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
153 struct vm_area_struct *vma, unsigned long start, 153 struct vm_area_struct *vma, unsigned long start,
154 unsigned long end, struct page *ref_page) 154 unsigned long end, struct page *ref_page)
155 { 155 {
156 BUG(); 156 BUG();
157 } 157 }
158 158
159 #endif /* !CONFIG_HUGETLB_PAGE */ 159 #endif /* !CONFIG_HUGETLB_PAGE */
160 160
161 #define HUGETLB_ANON_FILE "anon_hugepage" 161 #define HUGETLB_ANON_FILE "anon_hugepage"
162 162
163 enum { 163 enum {
164 /* 164 /*
165 * The file will be used as an shm file so shmfs accounting rules 165 * The file will be used as an shm file so shmfs accounting rules
166 * apply 166 * apply
167 */ 167 */
168 HUGETLB_SHMFS_INODE = 1, 168 HUGETLB_SHMFS_INODE = 1,
169 /* 169 /*
170 * The file is being created on the internal vfs mount and shmfs 170 * The file is being created on the internal vfs mount and shmfs
171 * accounting rules do not apply 171 * accounting rules do not apply
172 */ 172 */
173 HUGETLB_ANONHUGE_INODE = 2, 173 HUGETLB_ANONHUGE_INODE = 2,
174 }; 174 };
175 175
176 #ifdef CONFIG_HUGETLBFS 176 #ifdef CONFIG_HUGETLBFS
177 struct hugetlbfs_sb_info { 177 struct hugetlbfs_sb_info {
178 long max_inodes; /* inodes allowed */ 178 long max_inodes; /* inodes allowed */
179 long free_inodes; /* inodes free */ 179 long free_inodes; /* inodes free */
180 spinlock_t stat_lock; 180 spinlock_t stat_lock;
181 struct hstate *hstate; 181 struct hstate *hstate;
182 struct hugepage_subpool *spool; 182 struct hugepage_subpool *spool;
183 }; 183 };
184 184
185 static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) 185 static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
186 { 186 {
187 return sb->s_fs_info; 187 return sb->s_fs_info;
188 } 188 }
189 189
190 extern const struct file_operations hugetlbfs_file_operations; 190 extern const struct file_operations hugetlbfs_file_operations;
191 extern const struct vm_operations_struct hugetlb_vm_ops; 191 extern const struct vm_operations_struct hugetlb_vm_ops;
192 struct file *hugetlb_file_setup(const char *name, unsigned long addr, 192 struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
193 size_t size, vm_flags_t acct,
194 struct user_struct **user, int creat_flags, 193 struct user_struct **user, int creat_flags,
195 int page_size_log); 194 int page_size_log);
196 195
197 static inline int is_file_hugepages(struct file *file) 196 static inline int is_file_hugepages(struct file *file)
198 { 197 {
199 if (file->f_op == &hugetlbfs_file_operations) 198 if (file->f_op == &hugetlbfs_file_operations)
200 return 1; 199 return 1;
201 if (is_file_shm_hugepages(file)) 200 if (is_file_shm_hugepages(file))
202 return 1; 201 return 1;
203 202
204 return 0; 203 return 0;
205 } 204 }
206 205
207 206
208 #else /* !CONFIG_HUGETLBFS */ 207 #else /* !CONFIG_HUGETLBFS */
209 208
210 #define is_file_hugepages(file) 0 209 #define is_file_hugepages(file) 0
211 static inline struct file * 210 static inline struct file *
212 hugetlb_file_setup(const char *name, unsigned long addr, size_t size, 211 hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag,
213 vm_flags_t acctflag, struct user_struct **user, int creat_flags, 212 struct user_struct **user, int creat_flags,
214 int page_size_log) 213 int page_size_log)
215 { 214 {
216 return ERR_PTR(-ENOSYS); 215 return ERR_PTR(-ENOSYS);
217 } 216 }
218 217
219 #endif /* !CONFIG_HUGETLBFS */ 218 #endif /* !CONFIG_HUGETLBFS */
220 219
221 #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA 220 #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
222 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 221 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
223 unsigned long len, unsigned long pgoff, 222 unsigned long len, unsigned long pgoff,
224 unsigned long flags); 223 unsigned long flags);
225 #endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */ 224 #endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */
226 225
227 #ifdef CONFIG_HUGETLB_PAGE 226 #ifdef CONFIG_HUGETLB_PAGE
228 227
229 #define HSTATE_NAME_LEN 32 228 #define HSTATE_NAME_LEN 32
230 /* Defines one hugetlb page size */ 229 /* Defines one hugetlb page size */
231 struct hstate { 230 struct hstate {
232 int next_nid_to_alloc; 231 int next_nid_to_alloc;
233 int next_nid_to_free; 232 int next_nid_to_free;
234 unsigned int order; 233 unsigned int order;
235 unsigned long mask; 234 unsigned long mask;
236 unsigned long max_huge_pages; 235 unsigned long max_huge_pages;
237 unsigned long nr_huge_pages; 236 unsigned long nr_huge_pages;
238 unsigned long free_huge_pages; 237 unsigned long free_huge_pages;
239 unsigned long resv_huge_pages; 238 unsigned long resv_huge_pages;
240 unsigned long surplus_huge_pages; 239 unsigned long surplus_huge_pages;
241 unsigned long nr_overcommit_huge_pages; 240 unsigned long nr_overcommit_huge_pages;
242 struct list_head hugepage_activelist; 241 struct list_head hugepage_activelist;
243 struct list_head hugepage_freelists[MAX_NUMNODES]; 242 struct list_head hugepage_freelists[MAX_NUMNODES];
244 unsigned int nr_huge_pages_node[MAX_NUMNODES]; 243 unsigned int nr_huge_pages_node[MAX_NUMNODES];
245 unsigned int free_huge_pages_node[MAX_NUMNODES]; 244 unsigned int free_huge_pages_node[MAX_NUMNODES];
246 unsigned int surplus_huge_pages_node[MAX_NUMNODES]; 245 unsigned int surplus_huge_pages_node[MAX_NUMNODES];
247 #ifdef CONFIG_CGROUP_HUGETLB 246 #ifdef CONFIG_CGROUP_HUGETLB
248 /* cgroup control files */ 247 /* cgroup control files */
249 struct cftype cgroup_files[5]; 248 struct cftype cgroup_files[5];
250 #endif 249 #endif
251 char name[HSTATE_NAME_LEN]; 250 char name[HSTATE_NAME_LEN];
252 }; 251 };
253 252
254 struct huge_bootmem_page { 253 struct huge_bootmem_page {
255 struct list_head list; 254 struct list_head list;
256 struct hstate *hstate; 255 struct hstate *hstate;
257 #ifdef CONFIG_HIGHMEM 256 #ifdef CONFIG_HIGHMEM
258 phys_addr_t phys; 257 phys_addr_t phys;
259 #endif 258 #endif
260 }; 259 };
261 260
262 struct page *alloc_huge_page_node(struct hstate *h, int nid); 261 struct page *alloc_huge_page_node(struct hstate *h, int nid);
263 262
264 /* arch callback */ 263 /* arch callback */
265 int __init alloc_bootmem_huge_page(struct hstate *h); 264 int __init alloc_bootmem_huge_page(struct hstate *h);
266 265
267 void __init hugetlb_add_hstate(unsigned order); 266 void __init hugetlb_add_hstate(unsigned order);
268 struct hstate *size_to_hstate(unsigned long size); 267 struct hstate *size_to_hstate(unsigned long size);
269 268
270 #ifndef HUGE_MAX_HSTATE 269 #ifndef HUGE_MAX_HSTATE
271 #define HUGE_MAX_HSTATE 1 270 #define HUGE_MAX_HSTATE 1
272 #endif 271 #endif
273 272
274 extern struct hstate hstates[HUGE_MAX_HSTATE]; 273 extern struct hstate hstates[HUGE_MAX_HSTATE];
275 extern unsigned int default_hstate_idx; 274 extern unsigned int default_hstate_idx;
276 275
277 #define default_hstate (hstates[default_hstate_idx]) 276 #define default_hstate (hstates[default_hstate_idx])
278 277
279 static inline struct hstate *hstate_inode(struct inode *i) 278 static inline struct hstate *hstate_inode(struct inode *i)
280 { 279 {
281 struct hugetlbfs_sb_info *hsb; 280 struct hugetlbfs_sb_info *hsb;
282 hsb = HUGETLBFS_SB(i->i_sb); 281 hsb = HUGETLBFS_SB(i->i_sb);
283 return hsb->hstate; 282 return hsb->hstate;
284 } 283 }
285 284
286 static inline struct hstate *hstate_file(struct file *f) 285 static inline struct hstate *hstate_file(struct file *f)
287 { 286 {
288 return hstate_inode(file_inode(f)); 287 return hstate_inode(file_inode(f));
289 } 288 }
290 289
290 static inline struct hstate *hstate_sizelog(int page_size_log)
291 {
292 if (!page_size_log)
293 return &default_hstate;
294 return size_to_hstate(1 << page_size_log);
295 }
296
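hstate_sizelog() is the helper this patch introduces so callers can translate the page-size-log bits carried in mmap()/shmget() flags into an hstate: 0 keeps the default hugepage size, any other value is looked up with size_to_hstate() and yields NULL when no such hstate is registered. A sketch of the intended use (the -EINVAL return is how a caller would typically reject an unsupported size; the variable names are illustrative):

	/* page_size_log comes from the flag bits,
	 * e.g. (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK; 21 selects 2 MB pages.
	 */
	struct hstate *hs = hstate_sizelog(page_size_log);
	if (!hs)
		return -EINVAL;		/* no hstate registered for that page size */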
291 static inline struct hstate *hstate_vma(struct vm_area_struct *vma) 297 static inline struct hstate *hstate_vma(struct vm_area_struct *vma)
292 { 298 {
293 return hstate_file(vma->vm_file); 299 return hstate_file(vma->vm_file);
294 } 300 }
295 301
296 static inline unsigned long huge_page_size(struct hstate *h) 302 static inline unsigned long huge_page_size(struct hstate *h)
297 { 303 {
298 return (unsigned long)PAGE_SIZE << h->order; 304 return (unsigned long)PAGE_SIZE << h->order;
299 } 305 }
300 306
301 extern unsigned long vma_kernel_pagesize(struct vm_area_struct *vma); 307 extern unsigned long vma_kernel_pagesize(struct vm_area_struct *vma);
302 308
303 extern unsigned long vma_mmu_pagesize(struct vm_area_struct *vma); 309 extern unsigned long vma_mmu_pagesize(struct vm_area_struct *vma);
304 310
305 static inline unsigned long huge_page_mask(struct hstate *h) 311 static inline unsigned long huge_page_mask(struct hstate *h)
306 { 312 {
307 return h->mask; 313 return h->mask;
308 } 314 }
309 315
310 static inline unsigned int huge_page_order(struct hstate *h) 316 static inline unsigned int huge_page_order(struct hstate *h)
311 { 317 {
312 return h->order; 318 return h->order;
313 } 319 }
314 320
315 static inline unsigned huge_page_shift(struct hstate *h) 321 static inline unsigned huge_page_shift(struct hstate *h)
316 { 322 {
317 return h->order + PAGE_SHIFT; 323 return h->order + PAGE_SHIFT;
318 } 324 }
319 325
320 static inline unsigned int pages_per_huge_page(struct hstate *h) 326 static inline unsigned int pages_per_huge_page(struct hstate *h)
321 { 327 {
322 return 1 << h->order; 328 return 1 << h->order;
323 } 329 }
324 330
325 static inline unsigned int blocks_per_huge_page(struct hstate *h) 331 static inline unsigned int blocks_per_huge_page(struct hstate *h)
326 { 332 {
327 return huge_page_size(h) / 512; 333 return huge_page_size(h) / 512;
328 } 334 }
329 335
330 #include <asm/hugetlb.h> 336 #include <asm/hugetlb.h>
331 337
332 #ifndef arch_make_huge_pte 338 #ifndef arch_make_huge_pte
333 static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, 339 static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
334 struct page *page, int writable) 340 struct page *page, int writable)
335 { 341 {
336 return entry; 342 return entry;
337 } 343 }
338 #endif 344 #endif
339 345
340 static inline struct hstate *page_hstate(struct page *page) 346 static inline struct hstate *page_hstate(struct page *page)
341 { 347 {
342 return size_to_hstate(PAGE_SIZE << compound_order(page)); 348 return size_to_hstate(PAGE_SIZE << compound_order(page));
343 } 349 }
344 350
345 static inline unsigned hstate_index_to_shift(unsigned index) 351 static inline unsigned hstate_index_to_shift(unsigned index)
346 { 352 {
347 return hstates[index].order + PAGE_SHIFT; 353 return hstates[index].order + PAGE_SHIFT;
348 } 354 }
349 355
350 static inline int hstate_index(struct hstate *h) 356 static inline int hstate_index(struct hstate *h)
351 { 357 {
352 return h - hstates; 358 return h - hstates;
353 } 359 }
354 360
355 #else 361 #else /* CONFIG_HUGETLB_PAGE */
356 struct hstate {}; 362 struct hstate {};
357 #define alloc_huge_page_node(h, nid) NULL 363 #define alloc_huge_page_node(h, nid) NULL
358 #define alloc_bootmem_huge_page(h) NULL 364 #define alloc_bootmem_huge_page(h) NULL
359 #define hstate_file(f) NULL 365 #define hstate_file(f) NULL
366 #define hstate_sizelog(s) NULL
360 #define hstate_vma(v) NULL 367 #define hstate_vma(v) NULL
361 #define hstate_inode(i) NULL 368 #define hstate_inode(i) NULL
362 #define huge_page_size(h) PAGE_SIZE 369 #define huge_page_size(h) PAGE_SIZE
363 #define huge_page_mask(h) PAGE_MASK 370 #define huge_page_mask(h) PAGE_MASK
364 #define vma_kernel_pagesize(v) PAGE_SIZE 371 #define vma_kernel_pagesize(v) PAGE_SIZE
365 #define vma_mmu_pagesize(v) PAGE_SIZE 372 #define vma_mmu_pagesize(v) PAGE_SIZE
366 #define huge_page_order(h) 0 373 #define huge_page_order(h) 0
367 #define huge_page_shift(h) PAGE_SHIFT 374 #define huge_page_shift(h) PAGE_SHIFT
368 static inline unsigned int pages_per_huge_page(struct hstate *h) 375 static inline unsigned int pages_per_huge_page(struct hstate *h)
369 { 376 {
370 return 1; 377 return 1;
371 } 378 }
372 #define hstate_index_to_shift(index) 0 379 #define hstate_index_to_shift(index) 0
373 #define hstate_index(h) 0 380 #define hstate_index(h) 0
374 #endif 381 #endif /* CONFIG_HUGETLB_PAGE */
375 382
376 #endif /* _LINUX_HUGETLB_H */ 383 #endif /* _LINUX_HUGETLB_H */
ipc/shm.c
1 /* 1 /*
2 * linux/ipc/shm.c 2 * linux/ipc/shm.c
3 * Copyright (C) 1992, 1993 Krishna Balasubramanian 3 * Copyright (C) 1992, 1993 Krishna Balasubramanian
4 * Many improvements/fixes by Bruno Haible. 4 * Many improvements/fixes by Bruno Haible.
5 * Replaced `struct shm_desc' by `struct vm_area_struct', July 1994. 5 * Replaced `struct shm_desc' by `struct vm_area_struct', July 1994.
6 * Fixed the shm swap deallocation (shm_unuse()), August 1998 Andrea Arcangeli. 6 * Fixed the shm swap deallocation (shm_unuse()), August 1998 Andrea Arcangeli.
7 * 7 *
8 * /proc/sysvipc/shm support (c) 1999 Dragos Acostachioaie <dragos@iname.com> 8 * /proc/sysvipc/shm support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
9 * BIGMEM support, Andrea Arcangeli <andrea@suse.de> 9 * BIGMEM support, Andrea Arcangeli <andrea@suse.de>
10 * SMP thread shm, Jean-Luc Boyard <jean-luc.boyard@siemens.fr> 10 * SMP thread shm, Jean-Luc Boyard <jean-luc.boyard@siemens.fr>
11 * HIGHMEM support, Ingo Molnar <mingo@redhat.com> 11 * HIGHMEM support, Ingo Molnar <mingo@redhat.com>
12 * Make shmmax, shmall, shmmni sysctl'able, Christoph Rohland <cr@sap.com> 12 * Make shmmax, shmall, shmmni sysctl'able, Christoph Rohland <cr@sap.com>
13 * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com> 13 * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com>
14 * Move the mm functionality over to mm/shmem.c, Christoph Rohland <cr@sap.com> 14 * Move the mm functionality over to mm/shmem.c, Christoph Rohland <cr@sap.com>
15 * 15 *
16 * support for audit of ipc object properties and permission changes 16 * support for audit of ipc object properties and permission changes
17 * Dustin Kirkland <dustin.kirkland@us.ibm.com> 17 * Dustin Kirkland <dustin.kirkland@us.ibm.com>
18 * 18 *
19 * namespaces support 19 * namespaces support
20 * OpenVZ, SWsoft Inc. 20 * OpenVZ, SWsoft Inc.
21 * Pavel Emelianov <xemul@openvz.org> 21 * Pavel Emelianov <xemul@openvz.org>
22 */ 22 */
23 23
24 #include <linux/slab.h> 24 #include <linux/slab.h>
25 #include <linux/mm.h> 25 #include <linux/mm.h>
26 #include <linux/hugetlb.h> 26 #include <linux/hugetlb.h>
27 #include <linux/shm.h> 27 #include <linux/shm.h>
28 #include <linux/init.h> 28 #include <linux/init.h>
29 #include <linux/file.h> 29 #include <linux/file.h>
30 #include <linux/mman.h> 30 #include <linux/mman.h>
31 #include <linux/shmem_fs.h> 31 #include <linux/shmem_fs.h>
32 #include <linux/security.h> 32 #include <linux/security.h>
33 #include <linux/syscalls.h> 33 #include <linux/syscalls.h>
34 #include <linux/audit.h> 34 #include <linux/audit.h>
35 #include <linux/capability.h> 35 #include <linux/capability.h>
36 #include <linux/ptrace.h> 36 #include <linux/ptrace.h>
37 #include <linux/seq_file.h> 37 #include <linux/seq_file.h>
38 #include <linux/rwsem.h> 38 #include <linux/rwsem.h>
39 #include <linux/nsproxy.h> 39 #include <linux/nsproxy.h>
40 #include <linux/mount.h> 40 #include <linux/mount.h>
41 #include <linux/ipc_namespace.h> 41 #include <linux/ipc_namespace.h>
42 42
43 #include <asm/uaccess.h> 43 #include <asm/uaccess.h>
44 44
45 #include "util.h" 45 #include "util.h"
46 46
47 struct shm_file_data { 47 struct shm_file_data {
48 int id; 48 int id;
49 struct ipc_namespace *ns; 49 struct ipc_namespace *ns;
50 struct file *file; 50 struct file *file;
51 const struct vm_operations_struct *vm_ops; 51 const struct vm_operations_struct *vm_ops;
52 }; 52 };
53 53
54 #define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data)) 54 #define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data))
55 55
56 static const struct file_operations shm_file_operations; 56 static const struct file_operations shm_file_operations;
57 static const struct vm_operations_struct shm_vm_ops; 57 static const struct vm_operations_struct shm_vm_ops;
58 58
59 #define shm_ids(ns) ((ns)->ids[IPC_SHM_IDS]) 59 #define shm_ids(ns) ((ns)->ids[IPC_SHM_IDS])
60 60
61 #define shm_unlock(shp) \ 61 #define shm_unlock(shp) \
62 ipc_unlock(&(shp)->shm_perm) 62 ipc_unlock(&(shp)->shm_perm)
63 63
64 static int newseg(struct ipc_namespace *, struct ipc_params *); 64 static int newseg(struct ipc_namespace *, struct ipc_params *);
65 static void shm_open(struct vm_area_struct *vma); 65 static void shm_open(struct vm_area_struct *vma);
66 static void shm_close(struct vm_area_struct *vma); 66 static void shm_close(struct vm_area_struct *vma);
67 static void shm_destroy (struct ipc_namespace *ns, struct shmid_kernel *shp); 67 static void shm_destroy (struct ipc_namespace *ns, struct shmid_kernel *shp);
68 #ifdef CONFIG_PROC_FS 68 #ifdef CONFIG_PROC_FS
69 static int sysvipc_shm_proc_show(struct seq_file *s, void *it); 69 static int sysvipc_shm_proc_show(struct seq_file *s, void *it);
70 #endif 70 #endif
71 71
72 void shm_init_ns(struct ipc_namespace *ns) 72 void shm_init_ns(struct ipc_namespace *ns)
73 { 73 {
74 ns->shm_ctlmax = SHMMAX; 74 ns->shm_ctlmax = SHMMAX;
75 ns->shm_ctlall = SHMALL; 75 ns->shm_ctlall = SHMALL;
76 ns->shm_ctlmni = SHMMNI; 76 ns->shm_ctlmni = SHMMNI;
77 ns->shm_rmid_forced = 0; 77 ns->shm_rmid_forced = 0;
78 ns->shm_tot = 0; 78 ns->shm_tot = 0;
79 ipc_init_ids(&shm_ids(ns)); 79 ipc_init_ids(&shm_ids(ns));
80 } 80 }
81 81
82 /* 82 /*
83 * Called with shm_ids.rw_mutex (writer) and the shp structure locked. 83 * Called with shm_ids.rw_mutex (writer) and the shp structure locked.
84 * Only shm_ids.rw_mutex remains locked on exit. 84 * Only shm_ids.rw_mutex remains locked on exit.
85 */ 85 */
86 static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) 86 static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
87 { 87 {
88 struct shmid_kernel *shp; 88 struct shmid_kernel *shp;
89 shp = container_of(ipcp, struct shmid_kernel, shm_perm); 89 shp = container_of(ipcp, struct shmid_kernel, shm_perm);
90 90
91 if (shp->shm_nattch){ 91 if (shp->shm_nattch){
92 shp->shm_perm.mode |= SHM_DEST; 92 shp->shm_perm.mode |= SHM_DEST;
93 /* Do not find it any more */ 93 /* Do not find it any more */
94 shp->shm_perm.key = IPC_PRIVATE; 94 shp->shm_perm.key = IPC_PRIVATE;
95 shm_unlock(shp); 95 shm_unlock(shp);
96 } else 96 } else
97 shm_destroy(ns, shp); 97 shm_destroy(ns, shp);
98 } 98 }
99 99
100 #ifdef CONFIG_IPC_NS 100 #ifdef CONFIG_IPC_NS
101 void shm_exit_ns(struct ipc_namespace *ns) 101 void shm_exit_ns(struct ipc_namespace *ns)
102 { 102 {
103 free_ipcs(ns, &shm_ids(ns), do_shm_rmid); 103 free_ipcs(ns, &shm_ids(ns), do_shm_rmid);
104 idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr); 104 idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr);
105 } 105 }
106 #endif 106 #endif
107 107
108 static int __init ipc_ns_init(void) 108 static int __init ipc_ns_init(void)
109 { 109 {
110 shm_init_ns(&init_ipc_ns); 110 shm_init_ns(&init_ipc_ns);
111 return 0; 111 return 0;
112 } 112 }
113 113
114 pure_initcall(ipc_ns_init); 114 pure_initcall(ipc_ns_init);
115 115
116 void __init shm_init (void) 116 void __init shm_init (void)
117 { 117 {
118 ipc_init_proc_interface("sysvipc/shm", 118 ipc_init_proc_interface("sysvipc/shm",
119 #if BITS_PER_LONG <= 32 119 #if BITS_PER_LONG <= 32
120 " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n", 120 " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n",
121 #else 121 #else
122 " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n", 122 " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n",
123 #endif 123 #endif
124 IPC_SHM_IDS, sysvipc_shm_proc_show); 124 IPC_SHM_IDS, sysvipc_shm_proc_show);
125 } 125 }
126 126
127 /* 127 /*
128 * shm_lock_(check_) routines are called in the paths where the rw_mutex 128 * shm_lock_(check_) routines are called in the paths where the rw_mutex
129 * is not necessarily held. 129 * is not necessarily held.
130 */ 130 */
131 static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) 131 static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
132 { 132 {
133 struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id); 133 struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id);
134 134
135 if (IS_ERR(ipcp)) 135 if (IS_ERR(ipcp))
136 return (struct shmid_kernel *)ipcp; 136 return (struct shmid_kernel *)ipcp;
137 137
138 return container_of(ipcp, struct shmid_kernel, shm_perm); 138 return container_of(ipcp, struct shmid_kernel, shm_perm);
139 } 139 }
140 140
141 static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp) 141 static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp)
142 { 142 {
143 rcu_read_lock(); 143 rcu_read_lock();
144 spin_lock(&ipcp->shm_perm.lock); 144 spin_lock(&ipcp->shm_perm.lock);
145 } 145 }
146 146
147 static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns, 147 static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns,
148 int id) 148 int id)
149 { 149 {
150 struct kern_ipc_perm *ipcp = ipc_lock_check(&shm_ids(ns), id); 150 struct kern_ipc_perm *ipcp = ipc_lock_check(&shm_ids(ns), id);
151 151
152 if (IS_ERR(ipcp)) 152 if (IS_ERR(ipcp))
153 return (struct shmid_kernel *)ipcp; 153 return (struct shmid_kernel *)ipcp;
154 154
155 return container_of(ipcp, struct shmid_kernel, shm_perm); 155 return container_of(ipcp, struct shmid_kernel, shm_perm);
156 } 156 }
157 157
158 static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) 158 static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
159 { 159 {
160 ipc_rmid(&shm_ids(ns), &s->shm_perm); 160 ipc_rmid(&shm_ids(ns), &s->shm_perm);
161 } 161 }
162 162
163 163
164 /* This is called by fork, once for every shm attach. */ 164 /* This is called by fork, once for every shm attach. */
165 static void shm_open(struct vm_area_struct *vma) 165 static void shm_open(struct vm_area_struct *vma)
166 { 166 {
167 struct file *file = vma->vm_file; 167 struct file *file = vma->vm_file;
168 struct shm_file_data *sfd = shm_file_data(file); 168 struct shm_file_data *sfd = shm_file_data(file);
169 struct shmid_kernel *shp; 169 struct shmid_kernel *shp;
170 170
171 shp = shm_lock(sfd->ns, sfd->id); 171 shp = shm_lock(sfd->ns, sfd->id);
172 BUG_ON(IS_ERR(shp)); 172 BUG_ON(IS_ERR(shp));
173 shp->shm_atim = get_seconds(); 173 shp->shm_atim = get_seconds();
174 shp->shm_lprid = task_tgid_vnr(current); 174 shp->shm_lprid = task_tgid_vnr(current);
175 shp->shm_nattch++; 175 shp->shm_nattch++;
176 shm_unlock(shp); 176 shm_unlock(shp);
177 } 177 }
178 178
179 /* 179 /*
180 * shm_destroy - free the struct shmid_kernel 180 * shm_destroy - free the struct shmid_kernel
181 * 181 *
182 * @ns: namespace 182 * @ns: namespace
183 * @shp: struct to free 183 * @shp: struct to free
184 * 184 *
185 * It has to be called with shp and shm_ids.rw_mutex (writer) locked, 185 * It has to be called with shp and shm_ids.rw_mutex (writer) locked,
186 * but returns with shp unlocked and freed. 186 * but returns with shp unlocked and freed.
187 */ 187 */
188 static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) 188 static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
189 { 189 {
190 ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; 190 ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
191 shm_rmid(ns, shp); 191 shm_rmid(ns, shp);
192 shm_unlock(shp); 192 shm_unlock(shp);
193 if (!is_file_hugepages(shp->shm_file)) 193 if (!is_file_hugepages(shp->shm_file))
194 shmem_lock(shp->shm_file, 0, shp->mlock_user); 194 shmem_lock(shp->shm_file, 0, shp->mlock_user);
195 else if (shp->mlock_user) 195 else if (shp->mlock_user)
196 user_shm_unlock(file_inode(shp->shm_file)->i_size, 196 user_shm_unlock(file_inode(shp->shm_file)->i_size,
197 shp->mlock_user); 197 shp->mlock_user);
198 fput (shp->shm_file); 198 fput (shp->shm_file);
199 security_shm_free(shp); 199 security_shm_free(shp);
200 ipc_rcu_putref(shp); 200 ipc_rcu_putref(shp);
201 } 201 }
202 202
203 /* 203 /*
204 * shm_may_destroy - identifies whether shm segment should be destroyed now 204 * shm_may_destroy - identifies whether shm segment should be destroyed now
205 * 205 *
206 * Returns true if and only if there are no active users of the segment and 206 * Returns true if and only if there are no active users of the segment and
207 * one of the following is true: 207 * one of the following is true:
208 * 208 *
209 * 1) shmctl(id, IPC_RMID, NULL) was called for this shp 209 * 1) shmctl(id, IPC_RMID, NULL) was called for this shp
210 * 210 *
211 * 2) sysctl kernel.shm_rmid_forced is set to 1. 211 * 2) sysctl kernel.shm_rmid_forced is set to 1.
212 */ 212 */
213 static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) 213 static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
214 { 214 {
215 return (shp->shm_nattch == 0) && 215 return (shp->shm_nattch == 0) &&
216 (ns->shm_rmid_forced || 216 (ns->shm_rmid_forced ||
217 (shp->shm_perm.mode & SHM_DEST)); 217 (shp->shm_perm.mode & SHM_DEST));
218 } 218 }
219 219
220 /* 220 /*
221 * remove the attach descriptor vma. 221 * remove the attach descriptor vma.
222 * free memory for segment if it is marked destroyed. 222 * free memory for segment if it is marked destroyed.
223 * The descriptor has already been removed from the current->mm->mmap list 223 * The descriptor has already been removed from the current->mm->mmap list
224 * and will later be kfree()d. 224 * and will later be kfree()d.
225 */ 225 */
226 static void shm_close(struct vm_area_struct *vma) 226 static void shm_close(struct vm_area_struct *vma)
227 { 227 {
228 struct file * file = vma->vm_file; 228 struct file * file = vma->vm_file;
229 struct shm_file_data *sfd = shm_file_data(file); 229 struct shm_file_data *sfd = shm_file_data(file);
230 struct shmid_kernel *shp; 230 struct shmid_kernel *shp;
231 struct ipc_namespace *ns = sfd->ns; 231 struct ipc_namespace *ns = sfd->ns;
232 232
233 down_write(&shm_ids(ns).rw_mutex); 233 down_write(&shm_ids(ns).rw_mutex);
234 /* remove from the list of attaches of the shm segment */ 234 /* remove from the list of attaches of the shm segment */
235 shp = shm_lock(ns, sfd->id); 235 shp = shm_lock(ns, sfd->id);
236 BUG_ON(IS_ERR(shp)); 236 BUG_ON(IS_ERR(shp));
237 shp->shm_lprid = task_tgid_vnr(current); 237 shp->shm_lprid = task_tgid_vnr(current);
238 shp->shm_dtim = get_seconds(); 238 shp->shm_dtim = get_seconds();
239 shp->shm_nattch--; 239 shp->shm_nattch--;
240 if (shm_may_destroy(ns, shp)) 240 if (shm_may_destroy(ns, shp))
241 shm_destroy(ns, shp); 241 shm_destroy(ns, shp);
242 else 242 else
243 shm_unlock(shp); 243 shm_unlock(shp);
244 up_write(&shm_ids(ns).rw_mutex); 244 up_write(&shm_ids(ns).rw_mutex);
245 } 245 }
246 246
247 /* Called with ns->shm_ids(ns).rw_mutex locked */ 247 /* Called with ns->shm_ids(ns).rw_mutex locked */
248 static int shm_try_destroy_current(int id, void *p, void *data) 248 static int shm_try_destroy_current(int id, void *p, void *data)
249 { 249 {
250 struct ipc_namespace *ns = data; 250 struct ipc_namespace *ns = data;
251 struct kern_ipc_perm *ipcp = p; 251 struct kern_ipc_perm *ipcp = p;
252 struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm); 252 struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm);
253 253
254 if (shp->shm_creator != current) 254 if (shp->shm_creator != current)
255 return 0; 255 return 0;
256 256
257 /* 257 /*
258 * Mark it as orphaned to destroy the segment when 258 * Mark it as orphaned to destroy the segment when
259 * kernel.shm_rmid_forced is changed. 259 * kernel.shm_rmid_forced is changed.
260 * It is noop if the following shm_may_destroy() returns true. 260 * It is noop if the following shm_may_destroy() returns true.
261 */ 261 */
262 shp->shm_creator = NULL; 262 shp->shm_creator = NULL;
263 263
264 /* 264 /*
265 * Don't even try to destroy it. If shm_rmid_forced=0 and IPC_RMID 265 * Don't even try to destroy it. If shm_rmid_forced=0 and IPC_RMID
266 * is not set, it shouldn't be deleted here. 266 * is not set, it shouldn't be deleted here.
267 */ 267 */
268 if (!ns->shm_rmid_forced) 268 if (!ns->shm_rmid_forced)
269 return 0; 269 return 0;
270 270
271 if (shm_may_destroy(ns, shp)) { 271 if (shm_may_destroy(ns, shp)) {
272 shm_lock_by_ptr(shp); 272 shm_lock_by_ptr(shp);
273 shm_destroy(ns, shp); 273 shm_destroy(ns, shp);
274 } 274 }
275 return 0; 275 return 0;
276 } 276 }
277 277
278 /* Called with ns->shm_ids(ns).rw_mutex locked */ 278 /* Called with ns->shm_ids(ns).rw_mutex locked */
279 static int shm_try_destroy_orphaned(int id, void *p, void *data) 279 static int shm_try_destroy_orphaned(int id, void *p, void *data)
280 { 280 {
281 struct ipc_namespace *ns = data; 281 struct ipc_namespace *ns = data;
282 struct kern_ipc_perm *ipcp = p; 282 struct kern_ipc_perm *ipcp = p;
283 struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm); 283 struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm);
284 284
285 /* 285 /*
286 * We want to destroy segments without users and with already 286 * We want to destroy segments without users and with already
287 * exit'ed originating process. 287 * exit'ed originating process.
288 * 288 *
289 * As shp->* are changed under rw_mutex, it's safe to skip shp locking. 289 * As shp->* are changed under rw_mutex, it's safe to skip shp locking.
290 */ 290 */
291 if (shp->shm_creator != NULL) 291 if (shp->shm_creator != NULL)
292 return 0; 292 return 0;
293 293
294 if (shm_may_destroy(ns, shp)) { 294 if (shm_may_destroy(ns, shp)) {
295 shm_lock_by_ptr(shp); 295 shm_lock_by_ptr(shp);
296 shm_destroy(ns, shp); 296 shm_destroy(ns, shp);
297 } 297 }
298 return 0; 298 return 0;
299 } 299 }
300 300
301 void shm_destroy_orphaned(struct ipc_namespace *ns) 301 void shm_destroy_orphaned(struct ipc_namespace *ns)
302 { 302 {
303 down_write(&shm_ids(ns).rw_mutex); 303 down_write(&shm_ids(ns).rw_mutex);
304 if (shm_ids(ns).in_use) 304 if (shm_ids(ns).in_use)
305 idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns); 305 idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns);
306 up_write(&shm_ids(ns).rw_mutex); 306 up_write(&shm_ids(ns).rw_mutex);
307 } 307 }
308 308
309 309
310 void exit_shm(struct task_struct *task) 310 void exit_shm(struct task_struct *task)
311 { 311 {
312 struct ipc_namespace *ns = task->nsproxy->ipc_ns; 312 struct ipc_namespace *ns = task->nsproxy->ipc_ns;
313 313
314 if (shm_ids(ns).in_use == 0) 314 if (shm_ids(ns).in_use == 0)
315 return; 315 return;
316 316
317 /* Destroy all already created segments, but not mapped yet */ 317 /* Destroy all already created segments, but not mapped yet */
318 down_write(&shm_ids(ns).rw_mutex); 318 down_write(&shm_ids(ns).rw_mutex);
319 if (shm_ids(ns).in_use) 319 if (shm_ids(ns).in_use)
320 idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns); 320 idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns);
321 up_write(&shm_ids(ns).rw_mutex); 321 up_write(&shm_ids(ns).rw_mutex);
322 } 322 }
323 323
324 static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 324 static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
325 { 325 {
326 struct file *file = vma->vm_file; 326 struct file *file = vma->vm_file;
327 struct shm_file_data *sfd = shm_file_data(file); 327 struct shm_file_data *sfd = shm_file_data(file);
328 328
329 return sfd->vm_ops->fault(vma, vmf); 329 return sfd->vm_ops->fault(vma, vmf);
330 } 330 }
331 331
332 #ifdef CONFIG_NUMA 332 #ifdef CONFIG_NUMA
333 static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new) 333 static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
334 { 334 {
335 struct file *file = vma->vm_file; 335 struct file *file = vma->vm_file;
336 struct shm_file_data *sfd = shm_file_data(file); 336 struct shm_file_data *sfd = shm_file_data(file);
337 int err = 0; 337 int err = 0;
338 if (sfd->vm_ops->set_policy) 338 if (sfd->vm_ops->set_policy)
339 err = sfd->vm_ops->set_policy(vma, new); 339 err = sfd->vm_ops->set_policy(vma, new);
340 return err; 340 return err;
341 } 341 }
342 342
343 static struct mempolicy *shm_get_policy(struct vm_area_struct *vma, 343 static struct mempolicy *shm_get_policy(struct vm_area_struct *vma,
344 unsigned long addr) 344 unsigned long addr)
345 { 345 {
346 struct file *file = vma->vm_file; 346 struct file *file = vma->vm_file;
347 struct shm_file_data *sfd = shm_file_data(file); 347 struct shm_file_data *sfd = shm_file_data(file);
348 struct mempolicy *pol = NULL; 348 struct mempolicy *pol = NULL;
349 349
350 if (sfd->vm_ops->get_policy) 350 if (sfd->vm_ops->get_policy)
351 pol = sfd->vm_ops->get_policy(vma, addr); 351 pol = sfd->vm_ops->get_policy(vma, addr);
352 else if (vma->vm_policy) 352 else if (vma->vm_policy)
353 pol = vma->vm_policy; 353 pol = vma->vm_policy;
354 354
355 return pol; 355 return pol;
356 } 356 }
357 #endif 357 #endif
358 358
359 static int shm_mmap(struct file * file, struct vm_area_struct * vma) 359 static int shm_mmap(struct file * file, struct vm_area_struct * vma)
360 { 360 {
361 struct shm_file_data *sfd = shm_file_data(file); 361 struct shm_file_data *sfd = shm_file_data(file);
362 int ret; 362 int ret;
363 363
364 ret = sfd->file->f_op->mmap(sfd->file, vma); 364 ret = sfd->file->f_op->mmap(sfd->file, vma);
365 if (ret != 0) 365 if (ret != 0)
366 return ret; 366 return ret;
367 sfd->vm_ops = vma->vm_ops; 367 sfd->vm_ops = vma->vm_ops;
368 #ifdef CONFIG_MMU 368 #ifdef CONFIG_MMU
369 BUG_ON(!sfd->vm_ops->fault); 369 BUG_ON(!sfd->vm_ops->fault);
370 #endif 370 #endif
371 vma->vm_ops = &shm_vm_ops; 371 vma->vm_ops = &shm_vm_ops;
372 shm_open(vma); 372 shm_open(vma);
373 373
374 return ret; 374 return ret;
375 } 375 }
376 376
377 static int shm_release(struct inode *ino, struct file *file) 377 static int shm_release(struct inode *ino, struct file *file)
378 { 378 {
379 struct shm_file_data *sfd = shm_file_data(file); 379 struct shm_file_data *sfd = shm_file_data(file);
380 380
381 put_ipc_ns(sfd->ns); 381 put_ipc_ns(sfd->ns);
382 shm_file_data(file) = NULL; 382 shm_file_data(file) = NULL;
383 kfree(sfd); 383 kfree(sfd);
384 return 0; 384 return 0;
385 } 385 }
386 386
387 static int shm_fsync(struct file *file, loff_t start, loff_t end, int datasync) 387 static int shm_fsync(struct file *file, loff_t start, loff_t end, int datasync)
388 { 388 {
389 struct shm_file_data *sfd = shm_file_data(file); 389 struct shm_file_data *sfd = shm_file_data(file);
390 390
391 if (!sfd->file->f_op->fsync) 391 if (!sfd->file->f_op->fsync)
392 return -EINVAL; 392 return -EINVAL;
393 return sfd->file->f_op->fsync(sfd->file, start, end, datasync); 393 return sfd->file->f_op->fsync(sfd->file, start, end, datasync);
394 } 394 }
395 395
396 static long shm_fallocate(struct file *file, int mode, loff_t offset, 396 static long shm_fallocate(struct file *file, int mode, loff_t offset,
397 loff_t len) 397 loff_t len)
398 { 398 {
399 struct shm_file_data *sfd = shm_file_data(file); 399 struct shm_file_data *sfd = shm_file_data(file);
400 400
401 if (!sfd->file->f_op->fallocate) 401 if (!sfd->file->f_op->fallocate)
402 return -EOPNOTSUPP; 402 return -EOPNOTSUPP;
403 return sfd->file->f_op->fallocate(file, mode, offset, len); 403 return sfd->file->f_op->fallocate(file, mode, offset, len);
404 } 404 }
405 405
406 static unsigned long shm_get_unmapped_area(struct file *file, 406 static unsigned long shm_get_unmapped_area(struct file *file,
407 unsigned long addr, unsigned long len, unsigned long pgoff, 407 unsigned long addr, unsigned long len, unsigned long pgoff,
408 unsigned long flags) 408 unsigned long flags)
409 { 409 {
410 struct shm_file_data *sfd = shm_file_data(file); 410 struct shm_file_data *sfd = shm_file_data(file);
411 return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len, 411 return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len,
412 pgoff, flags); 412 pgoff, flags);
413 } 413 }
414 414
415 static const struct file_operations shm_file_operations = { 415 static const struct file_operations shm_file_operations = {
416 .mmap = shm_mmap, 416 .mmap = shm_mmap,
417 .fsync = shm_fsync, 417 .fsync = shm_fsync,
418 .release = shm_release, 418 .release = shm_release,
419 #ifndef CONFIG_MMU 419 #ifndef CONFIG_MMU
420 .get_unmapped_area = shm_get_unmapped_area, 420 .get_unmapped_area = shm_get_unmapped_area,
421 #endif 421 #endif
422 .llseek = noop_llseek, 422 .llseek = noop_llseek,
423 .fallocate = shm_fallocate, 423 .fallocate = shm_fallocate,
424 }; 424 };
425 425
426 static const struct file_operations shm_file_operations_huge = { 426 static const struct file_operations shm_file_operations_huge = {
427 .mmap = shm_mmap, 427 .mmap = shm_mmap,
428 .fsync = shm_fsync, 428 .fsync = shm_fsync,
429 .release = shm_release, 429 .release = shm_release,
430 .get_unmapped_area = shm_get_unmapped_area, 430 .get_unmapped_area = shm_get_unmapped_area,
431 .llseek = noop_llseek, 431 .llseek = noop_llseek,
432 .fallocate = shm_fallocate, 432 .fallocate = shm_fallocate,
433 }; 433 };
434 434
435 int is_file_shm_hugepages(struct file *file) 435 int is_file_shm_hugepages(struct file *file)
436 { 436 {
437 return file->f_op == &shm_file_operations_huge; 437 return file->f_op == &shm_file_operations_huge;
438 } 438 }
439 439
440 static const struct vm_operations_struct shm_vm_ops = { 440 static const struct vm_operations_struct shm_vm_ops = {
441 .open = shm_open, /* callback for a new vm-area open */ 441 .open = shm_open, /* callback for a new vm-area open */
442 .close = shm_close, /* callback for when the vm-area is released */ 442 .close = shm_close, /* callback for when the vm-area is released */
443 .fault = shm_fault, 443 .fault = shm_fault,
444 #if defined(CONFIG_NUMA) 444 #if defined(CONFIG_NUMA)
445 .set_policy = shm_set_policy, 445 .set_policy = shm_set_policy,
446 .get_policy = shm_get_policy, 446 .get_policy = shm_get_policy,
447 #endif 447 #endif
448 }; 448 };
449 449
450 /** 450 /**
451 * newseg - Create a new shared memory segment 451 * newseg - Create a new shared memory segment
452 * @ns: namespace 452 * @ns: namespace
453 * @params: ptr to the structure that contains key, size and shmflg 453 * @params: ptr to the structure that contains key, size and shmflg
454 * 454 *
455 * Called with shm_ids.rw_mutex held as a writer. 455 * Called with shm_ids.rw_mutex held as a writer.
456 */ 456 */
457 457
458 static int newseg(struct ipc_namespace *ns, struct ipc_params *params) 458 static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
459 { 459 {
460 key_t key = params->key; 460 key_t key = params->key;
461 int shmflg = params->flg; 461 int shmflg = params->flg;
462 size_t size = params->u.size; 462 size_t size = params->u.size;
463 int error; 463 int error;
464 struct shmid_kernel *shp; 464 struct shmid_kernel *shp;
465 size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 465 size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
466 struct file * file; 466 struct file * file;
467 char name[13]; 467 char name[13];
468 int id; 468 int id;
469 vm_flags_t acctflag = 0; 469 vm_flags_t acctflag = 0;
470 470
471 if (size < SHMMIN || size > ns->shm_ctlmax) 471 if (size < SHMMIN || size > ns->shm_ctlmax)
472 return -EINVAL; 472 return -EINVAL;
473 473
474 if (ns->shm_tot + numpages > ns->shm_ctlall) 474 if (ns->shm_tot + numpages > ns->shm_ctlall)
475 return -ENOSPC; 475 return -ENOSPC;
476 476
477 shp = ipc_rcu_alloc(sizeof(*shp)); 477 shp = ipc_rcu_alloc(sizeof(*shp));
478 if (!shp) 478 if (!shp)
479 return -ENOMEM; 479 return -ENOMEM;
480 480
481 shp->shm_perm.key = key; 481 shp->shm_perm.key = key;
482 shp->shm_perm.mode = (shmflg & S_IRWXUGO); 482 shp->shm_perm.mode = (shmflg & S_IRWXUGO);
483 shp->mlock_user = NULL; 483 shp->mlock_user = NULL;
484 484
485 shp->shm_perm.security = NULL; 485 shp->shm_perm.security = NULL;
486 error = security_shm_alloc(shp); 486 error = security_shm_alloc(shp);
487 if (error) { 487 if (error) {
488 ipc_rcu_putref(shp); 488 ipc_rcu_putref(shp);
489 return error; 489 return error;
490 } 490 }
491 491
492 sprintf (name, "SYSV%08x", key); 492 sprintf (name, "SYSV%08x", key);
493 if (shmflg & SHM_HUGETLB) { 493 if (shmflg & SHM_HUGETLB) {
494 struct hstate *hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT)
495 & SHM_HUGE_MASK);
496 size_t hugesize = ALIGN(size, huge_page_size(hs));
497
494 /* hugetlb_file_setup applies strict accounting */ 498 /* hugetlb_file_setup applies strict accounting */
495 if (shmflg & SHM_NORESERVE) 499 if (shmflg & SHM_NORESERVE)
496 acctflag = VM_NORESERVE; 500 acctflag = VM_NORESERVE;
497 file = hugetlb_file_setup(name, 0, size, acctflag, 501 file = hugetlb_file_setup(name, hugesize, acctflag,
498 &shp->mlock_user, HUGETLB_SHMFS_INODE, 502 &shp->mlock_user, HUGETLB_SHMFS_INODE,
499 (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); 503 (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
500 } else { 504 } else {
501 /* 505 /*
502 * Do not allow no accounting for OVERCOMMIT_NEVER, even 506 * Do not allow no accounting for OVERCOMMIT_NEVER, even
503 * if it's asked for. 507 * if it's asked for.
504 */ 508 */
505 if ((shmflg & SHM_NORESERVE) && 509 if ((shmflg & SHM_NORESERVE) &&
506 sysctl_overcommit_memory != OVERCOMMIT_NEVER) 510 sysctl_overcommit_memory != OVERCOMMIT_NEVER)
507 acctflag = VM_NORESERVE; 511 acctflag = VM_NORESERVE;
508 file = shmem_file_setup(name, size, acctflag); 512 file = shmem_file_setup(name, size, acctflag);
509 } 513 }
510 error = PTR_ERR(file); 514 error = PTR_ERR(file);
511 if (IS_ERR(file)) 515 if (IS_ERR(file))
512 goto no_file; 516 goto no_file;
513 517
514 id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); 518 id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
515 if (id < 0) { 519 if (id < 0) {
516 error = id; 520 error = id;
517 goto no_id; 521 goto no_id;
518 } 522 }
519 523
520 shp->shm_cprid = task_tgid_vnr(current); 524 shp->shm_cprid = task_tgid_vnr(current);
521 shp->shm_lprid = 0; 525 shp->shm_lprid = 0;
522 shp->shm_atim = shp->shm_dtim = 0; 526 shp->shm_atim = shp->shm_dtim = 0;
523 shp->shm_ctim = get_seconds(); 527 shp->shm_ctim = get_seconds();
524 shp->shm_segsz = size; 528 shp->shm_segsz = size;
525 shp->shm_nattch = 0; 529 shp->shm_nattch = 0;
526 shp->shm_file = file; 530 shp->shm_file = file;
527 shp->shm_creator = current; 531 shp->shm_creator = current;
528 /* 532 /*
529 * shmid gets reported as "inode#" in /proc/pid/maps. 533 * shmid gets reported as "inode#" in /proc/pid/maps.
530 * proc-ps tools use this. Changing this will break them. 534 * proc-ps tools use this. Changing this will break them.
531 */ 535 */
532 file_inode(file)->i_ino = shp->shm_perm.id; 536 file_inode(file)->i_ino = shp->shm_perm.id;
533 537
534 ns->shm_tot += numpages; 538 ns->shm_tot += numpages;
535 error = shp->shm_perm.id; 539 error = shp->shm_perm.id;
536 shm_unlock(shp); 540 shm_unlock(shp);
537 return error; 541 return error;
538 542
539 no_id: 543 no_id:
540 if (is_file_hugepages(file) && shp->mlock_user) 544 if (is_file_hugepages(file) && shp->mlock_user)
541 user_shm_unlock(size, shp->mlock_user); 545 user_shm_unlock(size, shp->mlock_user);
542 fput(file); 546 fput(file);
543 no_file: 547 no_file:
544 security_shm_free(shp); 548 security_shm_free(shp);
545 ipc_rcu_putref(shp); 549 ipc_rcu_putref(shp);
546 return error; 550 return error;
547 } 551 }
548 552
549 /* 553 /*
550 * Called with shm_ids.rw_mutex and ipcp locked. 554 * Called with shm_ids.rw_mutex and ipcp locked.
551 */ 555 */
552 static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg) 556 static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg)
553 { 557 {
554 struct shmid_kernel *shp; 558 struct shmid_kernel *shp;
555 559
556 shp = container_of(ipcp, struct shmid_kernel, shm_perm); 560 shp = container_of(ipcp, struct shmid_kernel, shm_perm);
557 return security_shm_associate(shp, shmflg); 561 return security_shm_associate(shp, shmflg);
558 } 562 }
559 563
560 /* 564 /*
561 * Called with shm_ids.rw_mutex and ipcp locked. 565 * Called with shm_ids.rw_mutex and ipcp locked.
562 */ 566 */
563 static inline int shm_more_checks(struct kern_ipc_perm *ipcp, 567 static inline int shm_more_checks(struct kern_ipc_perm *ipcp,
564 struct ipc_params *params) 568 struct ipc_params *params)
565 { 569 {
566 struct shmid_kernel *shp; 570 struct shmid_kernel *shp;
567 571
568 shp = container_of(ipcp, struct shmid_kernel, shm_perm); 572 shp = container_of(ipcp, struct shmid_kernel, shm_perm);
569 if (shp->shm_segsz < params->u.size) 573 if (shp->shm_segsz < params->u.size)
570 return -EINVAL; 574 return -EINVAL;
571 575
572 return 0; 576 return 0;
573 } 577 }
574 578
575 SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg) 579 SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg)
576 { 580 {
577 struct ipc_namespace *ns; 581 struct ipc_namespace *ns;
578 struct ipc_ops shm_ops; 582 struct ipc_ops shm_ops;
579 struct ipc_params shm_params; 583 struct ipc_params shm_params;
580 584
581 ns = current->nsproxy->ipc_ns; 585 ns = current->nsproxy->ipc_ns;
582 586
583 shm_ops.getnew = newseg; 587 shm_ops.getnew = newseg;
584 shm_ops.associate = shm_security; 588 shm_ops.associate = shm_security;
585 shm_ops.more_checks = shm_more_checks; 589 shm_ops.more_checks = shm_more_checks;
586 590
587 shm_params.key = key; 591 shm_params.key = key;
588 shm_params.flg = shmflg; 592 shm_params.flg = shmflg;
589 shm_params.u.size = size; 593 shm_params.u.size = size;
590 594
591 return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params); 595 return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params);
592 } 596 }
593 597
594 static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version) 598 static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version)
595 { 599 {
596 switch(version) { 600 switch(version) {
597 case IPC_64: 601 case IPC_64:
598 return copy_to_user(buf, in, sizeof(*in)); 602 return copy_to_user(buf, in, sizeof(*in));
599 case IPC_OLD: 603 case IPC_OLD:
600 { 604 {
601 struct shmid_ds out; 605 struct shmid_ds out;
602 606
603 memset(&out, 0, sizeof(out)); 607 memset(&out, 0, sizeof(out));
604 ipc64_perm_to_ipc_perm(&in->shm_perm, &out.shm_perm); 608 ipc64_perm_to_ipc_perm(&in->shm_perm, &out.shm_perm);
605 out.shm_segsz = in->shm_segsz; 609 out.shm_segsz = in->shm_segsz;
606 out.shm_atime = in->shm_atime; 610 out.shm_atime = in->shm_atime;
607 out.shm_dtime = in->shm_dtime; 611 out.shm_dtime = in->shm_dtime;
608 out.shm_ctime = in->shm_ctime; 612 out.shm_ctime = in->shm_ctime;
609 out.shm_cpid = in->shm_cpid; 613 out.shm_cpid = in->shm_cpid;
610 out.shm_lpid = in->shm_lpid; 614 out.shm_lpid = in->shm_lpid;
611 out.shm_nattch = in->shm_nattch; 615 out.shm_nattch = in->shm_nattch;
612 616
613 return copy_to_user(buf, &out, sizeof(out)); 617 return copy_to_user(buf, &out, sizeof(out));
614 } 618 }
615 default: 619 default:
616 return -EINVAL; 620 return -EINVAL;
617 } 621 }
618 } 622 }
619 623
620 static inline unsigned long 624 static inline unsigned long
621 copy_shmid_from_user(struct shmid64_ds *out, void __user *buf, int version) 625 copy_shmid_from_user(struct shmid64_ds *out, void __user *buf, int version)
622 { 626 {
623 switch(version) { 627 switch(version) {
624 case IPC_64: 628 case IPC_64:
625 if (copy_from_user(out, buf, sizeof(*out))) 629 if (copy_from_user(out, buf, sizeof(*out)))
626 return -EFAULT; 630 return -EFAULT;
627 return 0; 631 return 0;
628 case IPC_OLD: 632 case IPC_OLD:
629 { 633 {
630 struct shmid_ds tbuf_old; 634 struct shmid_ds tbuf_old;
631 635
632 if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old))) 636 if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
633 return -EFAULT; 637 return -EFAULT;
634 638
635 out->shm_perm.uid = tbuf_old.shm_perm.uid; 639 out->shm_perm.uid = tbuf_old.shm_perm.uid;
636 out->shm_perm.gid = tbuf_old.shm_perm.gid; 640 out->shm_perm.gid = tbuf_old.shm_perm.gid;
637 out->shm_perm.mode = tbuf_old.shm_perm.mode; 641 out->shm_perm.mode = tbuf_old.shm_perm.mode;
638 642
639 return 0; 643 return 0;
640 } 644 }
641 default: 645 default:
642 return -EINVAL; 646 return -EINVAL;
643 } 647 }
644 } 648 }
645 649
646 static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminfo64 *in, int version) 650 static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminfo64 *in, int version)
647 { 651 {
648 switch(version) { 652 switch(version) {
649 case IPC_64: 653 case IPC_64:
650 return copy_to_user(buf, in, sizeof(*in)); 654 return copy_to_user(buf, in, sizeof(*in));
651 case IPC_OLD: 655 case IPC_OLD:
652 { 656 {
653 struct shminfo out; 657 struct shminfo out;
654 658
655 if(in->shmmax > INT_MAX) 659 if(in->shmmax > INT_MAX)
656 out.shmmax = INT_MAX; 660 out.shmmax = INT_MAX;
657 else 661 else
658 out.shmmax = (int)in->shmmax; 662 out.shmmax = (int)in->shmmax;
659 663
660 out.shmmin = in->shmmin; 664 out.shmmin = in->shmmin;
661 out.shmmni = in->shmmni; 665 out.shmmni = in->shmmni;
662 out.shmseg = in->shmseg; 666 out.shmseg = in->shmseg;
663 out.shmall = in->shmall; 667 out.shmall = in->shmall;
664 668
665 return copy_to_user(buf, &out, sizeof(out)); 669 return copy_to_user(buf, &out, sizeof(out));
666 } 670 }
667 default: 671 default:
668 return -EINVAL; 672 return -EINVAL;
669 } 673 }
670 } 674 }
671 675
672 /* 676 /*
673 * Calculate and add used RSS and swap pages of a shm. 677 * Calculate and add used RSS and swap pages of a shm.
674 * Called with shm_ids.rw_mutex held as a reader 678 * Called with shm_ids.rw_mutex held as a reader
675 */ 679 */
676 static void shm_add_rss_swap(struct shmid_kernel *shp, 680 static void shm_add_rss_swap(struct shmid_kernel *shp,
677 unsigned long *rss_add, unsigned long *swp_add) 681 unsigned long *rss_add, unsigned long *swp_add)
678 { 682 {
679 struct inode *inode; 683 struct inode *inode;
680 684
681 inode = file_inode(shp->shm_file); 685 inode = file_inode(shp->shm_file);
682 686
683 if (is_file_hugepages(shp->shm_file)) { 687 if (is_file_hugepages(shp->shm_file)) {
684 struct address_space *mapping = inode->i_mapping; 688 struct address_space *mapping = inode->i_mapping;
685 struct hstate *h = hstate_file(shp->shm_file); 689 struct hstate *h = hstate_file(shp->shm_file);
686 *rss_add += pages_per_huge_page(h) * mapping->nrpages; 690 *rss_add += pages_per_huge_page(h) * mapping->nrpages;
687 } else { 691 } else {
688 #ifdef CONFIG_SHMEM 692 #ifdef CONFIG_SHMEM
689 struct shmem_inode_info *info = SHMEM_I(inode); 693 struct shmem_inode_info *info = SHMEM_I(inode);
690 spin_lock(&info->lock); 694 spin_lock(&info->lock);
691 *rss_add += inode->i_mapping->nrpages; 695 *rss_add += inode->i_mapping->nrpages;
692 *swp_add += info->swapped; 696 *swp_add += info->swapped;
693 spin_unlock(&info->lock); 697 spin_unlock(&info->lock);
694 #else 698 #else
695 *rss_add += inode->i_mapping->nrpages; 699 *rss_add += inode->i_mapping->nrpages;
696 #endif 700 #endif
697 } 701 }
698 } 702 }
699 703
700 /* 704 /*
701 * Called with shm_ids.rw_mutex held as a reader 705 * Called with shm_ids.rw_mutex held as a reader
702 */ 706 */
703 static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, 707 static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
704 unsigned long *swp) 708 unsigned long *swp)
705 { 709 {
706 int next_id; 710 int next_id;
707 int total, in_use; 711 int total, in_use;
708 712
709 *rss = 0; 713 *rss = 0;
710 *swp = 0; 714 *swp = 0;
711 715
712 in_use = shm_ids(ns).in_use; 716 in_use = shm_ids(ns).in_use;
713 717
714 for (total = 0, next_id = 0; total < in_use; next_id++) { 718 for (total = 0, next_id = 0; total < in_use; next_id++) {
715 struct kern_ipc_perm *ipc; 719 struct kern_ipc_perm *ipc;
716 struct shmid_kernel *shp; 720 struct shmid_kernel *shp;
717 721
718 ipc = idr_find(&shm_ids(ns).ipcs_idr, next_id); 722 ipc = idr_find(&shm_ids(ns).ipcs_idr, next_id);
719 if (ipc == NULL) 723 if (ipc == NULL)
720 continue; 724 continue;
721 shp = container_of(ipc, struct shmid_kernel, shm_perm); 725 shp = container_of(ipc, struct shmid_kernel, shm_perm);
722 726
723 shm_add_rss_swap(shp, rss, swp); 727 shm_add_rss_swap(shp, rss, swp);
724 728
725 total++; 729 total++;
726 } 730 }
727 } 731 }
728 732
729 /* 733 /*
730 * This function handles some shmctl commands which require the rw_mutex 734 * This function handles some shmctl commands which require the rw_mutex
731 * to be held in write mode. 735 * to be held in write mode.
732 * NOTE: no locks must be held, the rw_mutex is taken inside this function. 736 * NOTE: no locks must be held, the rw_mutex is taken inside this function.
733 */ 737 */
734 static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, 738 static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd,
735 struct shmid_ds __user *buf, int version) 739 struct shmid_ds __user *buf, int version)
736 { 740 {
737 struct kern_ipc_perm *ipcp; 741 struct kern_ipc_perm *ipcp;
738 struct shmid64_ds shmid64; 742 struct shmid64_ds shmid64;
739 struct shmid_kernel *shp; 743 struct shmid_kernel *shp;
740 int err; 744 int err;
741 745
742 if (cmd == IPC_SET) { 746 if (cmd == IPC_SET) {
743 if (copy_shmid_from_user(&shmid64, buf, version)) 747 if (copy_shmid_from_user(&shmid64, buf, version))
744 return -EFAULT; 748 return -EFAULT;
745 } 749 }
746 750
747 ipcp = ipcctl_pre_down(ns, &shm_ids(ns), shmid, cmd, 751 ipcp = ipcctl_pre_down(ns, &shm_ids(ns), shmid, cmd,
748 &shmid64.shm_perm, 0); 752 &shmid64.shm_perm, 0);
749 if (IS_ERR(ipcp)) 753 if (IS_ERR(ipcp))
750 return PTR_ERR(ipcp); 754 return PTR_ERR(ipcp);
751 755
752 shp = container_of(ipcp, struct shmid_kernel, shm_perm); 756 shp = container_of(ipcp, struct shmid_kernel, shm_perm);
753 757
754 err = security_shm_shmctl(shp, cmd); 758 err = security_shm_shmctl(shp, cmd);
755 if (err) 759 if (err)
756 goto out_unlock; 760 goto out_unlock;
757 switch (cmd) { 761 switch (cmd) {
758 case IPC_RMID: 762 case IPC_RMID:
759 do_shm_rmid(ns, ipcp); 763 do_shm_rmid(ns, ipcp);
760 goto out_up; 764 goto out_up;
761 case IPC_SET: 765 case IPC_SET:
762 err = ipc_update_perm(&shmid64.shm_perm, ipcp); 766 err = ipc_update_perm(&shmid64.shm_perm, ipcp);
763 if (err) 767 if (err)
764 goto out_unlock; 768 goto out_unlock;
765 shp->shm_ctim = get_seconds(); 769 shp->shm_ctim = get_seconds();
766 break; 770 break;
767 default: 771 default:
768 err = -EINVAL; 772 err = -EINVAL;
769 } 773 }
770 out_unlock: 774 out_unlock:
771 shm_unlock(shp); 775 shm_unlock(shp);
772 out_up: 776 out_up:
773 up_write(&shm_ids(ns).rw_mutex); 777 up_write(&shm_ids(ns).rw_mutex);
774 return err; 778 return err;
775 } 779 }
776 780
777 SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) 781 SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
778 { 782 {
779 struct shmid_kernel *shp; 783 struct shmid_kernel *shp;
780 int err, version; 784 int err, version;
781 struct ipc_namespace *ns; 785 struct ipc_namespace *ns;
782 786
783 if (cmd < 0 || shmid < 0) { 787 if (cmd < 0 || shmid < 0) {
784 err = -EINVAL; 788 err = -EINVAL;
785 goto out; 789 goto out;
786 } 790 }
787 791
788 version = ipc_parse_version(&cmd); 792 version = ipc_parse_version(&cmd);
789 ns = current->nsproxy->ipc_ns; 793 ns = current->nsproxy->ipc_ns;
790 794
791 switch (cmd) { /* replace with proc interface ? */ 795 switch (cmd) { /* replace with proc interface ? */
792 case IPC_INFO: 796 case IPC_INFO:
793 { 797 {
794 struct shminfo64 shminfo; 798 struct shminfo64 shminfo;
795 799
796 err = security_shm_shmctl(NULL, cmd); 800 err = security_shm_shmctl(NULL, cmd);
797 if (err) 801 if (err)
798 return err; 802 return err;
799 803
800 memset(&shminfo, 0, sizeof(shminfo)); 804 memset(&shminfo, 0, sizeof(shminfo));
801 shminfo.shmmni = shminfo.shmseg = ns->shm_ctlmni; 805 shminfo.shmmni = shminfo.shmseg = ns->shm_ctlmni;
802 shminfo.shmmax = ns->shm_ctlmax; 806 shminfo.shmmax = ns->shm_ctlmax;
803 shminfo.shmall = ns->shm_ctlall; 807 shminfo.shmall = ns->shm_ctlall;
804 808
805 shminfo.shmmin = SHMMIN; 809 shminfo.shmmin = SHMMIN;
806 if(copy_shminfo_to_user (buf, &shminfo, version)) 810 if(copy_shminfo_to_user (buf, &shminfo, version))
807 return -EFAULT; 811 return -EFAULT;
808 812
809 down_read(&shm_ids(ns).rw_mutex); 813 down_read(&shm_ids(ns).rw_mutex);
810 err = ipc_get_maxid(&shm_ids(ns)); 814 err = ipc_get_maxid(&shm_ids(ns));
811 up_read(&shm_ids(ns).rw_mutex); 815 up_read(&shm_ids(ns).rw_mutex);
812 816
813 if(err<0) 817 if(err<0)
814 err = 0; 818 err = 0;
815 goto out; 819 goto out;
816 } 820 }
817 case SHM_INFO: 821 case SHM_INFO:
818 { 822 {
819 struct shm_info shm_info; 823 struct shm_info shm_info;
820 824
821 err = security_shm_shmctl(NULL, cmd); 825 err = security_shm_shmctl(NULL, cmd);
822 if (err) 826 if (err)
823 return err; 827 return err;
824 828
825 memset(&shm_info, 0, sizeof(shm_info)); 829 memset(&shm_info, 0, sizeof(shm_info));
826 down_read(&shm_ids(ns).rw_mutex); 830 down_read(&shm_ids(ns).rw_mutex);
827 shm_info.used_ids = shm_ids(ns).in_use; 831 shm_info.used_ids = shm_ids(ns).in_use;
828 shm_get_stat (ns, &shm_info.shm_rss, &shm_info.shm_swp); 832 shm_get_stat (ns, &shm_info.shm_rss, &shm_info.shm_swp);
829 shm_info.shm_tot = ns->shm_tot; 833 shm_info.shm_tot = ns->shm_tot;
830 shm_info.swap_attempts = 0; 834 shm_info.swap_attempts = 0;
831 shm_info.swap_successes = 0; 835 shm_info.swap_successes = 0;
832 err = ipc_get_maxid(&shm_ids(ns)); 836 err = ipc_get_maxid(&shm_ids(ns));
833 up_read(&shm_ids(ns).rw_mutex); 837 up_read(&shm_ids(ns).rw_mutex);
834 if (copy_to_user(buf, &shm_info, sizeof(shm_info))) { 838 if (copy_to_user(buf, &shm_info, sizeof(shm_info))) {
835 err = -EFAULT; 839 err = -EFAULT;
836 goto out; 840 goto out;
837 } 841 }
838 842
839 err = err < 0 ? 0 : err; 843 err = err < 0 ? 0 : err;
840 goto out; 844 goto out;
841 } 845 }
842 case SHM_STAT: 846 case SHM_STAT:
843 case IPC_STAT: 847 case IPC_STAT:
844 { 848 {
845 struct shmid64_ds tbuf; 849 struct shmid64_ds tbuf;
846 int result; 850 int result;
847 851
848 if (cmd == SHM_STAT) { 852 if (cmd == SHM_STAT) {
849 shp = shm_lock(ns, shmid); 853 shp = shm_lock(ns, shmid);
850 if (IS_ERR(shp)) { 854 if (IS_ERR(shp)) {
851 err = PTR_ERR(shp); 855 err = PTR_ERR(shp);
852 goto out; 856 goto out;
853 } 857 }
854 result = shp->shm_perm.id; 858 result = shp->shm_perm.id;
855 } else { 859 } else {
856 shp = shm_lock_check(ns, shmid); 860 shp = shm_lock_check(ns, shmid);
857 if (IS_ERR(shp)) { 861 if (IS_ERR(shp)) {
858 err = PTR_ERR(shp); 862 err = PTR_ERR(shp);
859 goto out; 863 goto out;
860 } 864 }
861 result = 0; 865 result = 0;
862 } 866 }
863 err = -EACCES; 867 err = -EACCES;
864 if (ipcperms(ns, &shp->shm_perm, S_IRUGO)) 868 if (ipcperms(ns, &shp->shm_perm, S_IRUGO))
865 goto out_unlock; 869 goto out_unlock;
866 err = security_shm_shmctl(shp, cmd); 870 err = security_shm_shmctl(shp, cmd);
867 if (err) 871 if (err)
868 goto out_unlock; 872 goto out_unlock;
869 memset(&tbuf, 0, sizeof(tbuf)); 873 memset(&tbuf, 0, sizeof(tbuf));
870 kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm); 874 kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm);
871 tbuf.shm_segsz = shp->shm_segsz; 875 tbuf.shm_segsz = shp->shm_segsz;
872 tbuf.shm_atime = shp->shm_atim; 876 tbuf.shm_atime = shp->shm_atim;
873 tbuf.shm_dtime = shp->shm_dtim; 877 tbuf.shm_dtime = shp->shm_dtim;
874 tbuf.shm_ctime = shp->shm_ctim; 878 tbuf.shm_ctime = shp->shm_ctim;
875 tbuf.shm_cpid = shp->shm_cprid; 879 tbuf.shm_cpid = shp->shm_cprid;
876 tbuf.shm_lpid = shp->shm_lprid; 880 tbuf.shm_lpid = shp->shm_lprid;
877 tbuf.shm_nattch = shp->shm_nattch; 881 tbuf.shm_nattch = shp->shm_nattch;
878 shm_unlock(shp); 882 shm_unlock(shp);
879 if(copy_shmid_to_user (buf, &tbuf, version)) 883 if(copy_shmid_to_user (buf, &tbuf, version))
880 err = -EFAULT; 884 err = -EFAULT;
881 else 885 else
882 err = result; 886 err = result;
883 goto out; 887 goto out;
884 } 888 }
885 case SHM_LOCK: 889 case SHM_LOCK:
886 case SHM_UNLOCK: 890 case SHM_UNLOCK:
887 { 891 {
888 struct file *shm_file; 892 struct file *shm_file;
889 893
890 shp = shm_lock_check(ns, shmid); 894 shp = shm_lock_check(ns, shmid);
891 if (IS_ERR(shp)) { 895 if (IS_ERR(shp)) {
892 err = PTR_ERR(shp); 896 err = PTR_ERR(shp);
893 goto out; 897 goto out;
894 } 898 }
895 899
896 audit_ipc_obj(&(shp->shm_perm)); 900 audit_ipc_obj(&(shp->shm_perm));
897 901
898 if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) { 902 if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) {
899 kuid_t euid = current_euid(); 903 kuid_t euid = current_euid();
900 err = -EPERM; 904 err = -EPERM;
901 if (!uid_eq(euid, shp->shm_perm.uid) && 905 if (!uid_eq(euid, shp->shm_perm.uid) &&
902 !uid_eq(euid, shp->shm_perm.cuid)) 906 !uid_eq(euid, shp->shm_perm.cuid))
903 goto out_unlock; 907 goto out_unlock;
904 if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK)) 908 if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK))
905 goto out_unlock; 909 goto out_unlock;
906 } 910 }
907 911
908 err = security_shm_shmctl(shp, cmd); 912 err = security_shm_shmctl(shp, cmd);
909 if (err) 913 if (err)
910 goto out_unlock; 914 goto out_unlock;
911 915
912 shm_file = shp->shm_file; 916 shm_file = shp->shm_file;
913 if (is_file_hugepages(shm_file)) 917 if (is_file_hugepages(shm_file))
914 goto out_unlock; 918 goto out_unlock;
915 919
916 if (cmd == SHM_LOCK) { 920 if (cmd == SHM_LOCK) {
917 struct user_struct *user = current_user(); 921 struct user_struct *user = current_user();
918 err = shmem_lock(shm_file, 1, user); 922 err = shmem_lock(shm_file, 1, user);
919 if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) { 923 if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) {
920 shp->shm_perm.mode |= SHM_LOCKED; 924 shp->shm_perm.mode |= SHM_LOCKED;
921 shp->mlock_user = user; 925 shp->mlock_user = user;
922 } 926 }
923 goto out_unlock; 927 goto out_unlock;
924 } 928 }
925 929
926 /* SHM_UNLOCK */ 930 /* SHM_UNLOCK */
927 if (!(shp->shm_perm.mode & SHM_LOCKED)) 931 if (!(shp->shm_perm.mode & SHM_LOCKED))
928 goto out_unlock; 932 goto out_unlock;
929 shmem_lock(shm_file, 0, shp->mlock_user); 933 shmem_lock(shm_file, 0, shp->mlock_user);
930 shp->shm_perm.mode &= ~SHM_LOCKED; 934 shp->shm_perm.mode &= ~SHM_LOCKED;
931 shp->mlock_user = NULL; 935 shp->mlock_user = NULL;
932 get_file(shm_file); 936 get_file(shm_file);
933 shm_unlock(shp); 937 shm_unlock(shp);
934 shmem_unlock_mapping(shm_file->f_mapping); 938 shmem_unlock_mapping(shm_file->f_mapping);
935 fput(shm_file); 939 fput(shm_file);
936 goto out; 940 goto out;
937 } 941 }
938 case IPC_RMID: 942 case IPC_RMID:
939 case IPC_SET: 943 case IPC_SET:
940 err = shmctl_down(ns, shmid, cmd, buf, version); 944 err = shmctl_down(ns, shmid, cmd, buf, version);
941 return err; 945 return err;
942 default: 946 default:
943 return -EINVAL; 947 return -EINVAL;
944 } 948 }
945 949
946 out_unlock: 950 out_unlock:
947 shm_unlock(shp); 951 shm_unlock(shp);
948 out: 952 out:
949 return err; 953 return err;
950 } 954 }
951 955
952 /* 956 /*
953 * Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists. 957 * Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists.
954 * 958 *
955 * NOTE! Despite the name, this is NOT a direct system call entrypoint. The 959 * NOTE! Despite the name, this is NOT a direct system call entrypoint. The
956 * "raddr" thing points to kernel space, and there has to be a wrapper around 960 * "raddr" thing points to kernel space, and there has to be a wrapper around
957 * this. 961 * this.
958 */ 962 */
959 long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, 963 long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
960 unsigned long shmlba) 964 unsigned long shmlba)
961 { 965 {
962 struct shmid_kernel *shp; 966 struct shmid_kernel *shp;
963 unsigned long addr; 967 unsigned long addr;
964 unsigned long size; 968 unsigned long size;
965 struct file * file; 969 struct file * file;
966 int err; 970 int err;
967 unsigned long flags; 971 unsigned long flags;
968 unsigned long prot; 972 unsigned long prot;
969 int acc_mode; 973 int acc_mode;
970 struct ipc_namespace *ns; 974 struct ipc_namespace *ns;
971 struct shm_file_data *sfd; 975 struct shm_file_data *sfd;
972 struct path path; 976 struct path path;
973 fmode_t f_mode; 977 fmode_t f_mode;
974 unsigned long populate = 0; 978 unsigned long populate = 0;
975 979
976 err = -EINVAL; 980 err = -EINVAL;
977 if (shmid < 0) 981 if (shmid < 0)
978 goto out; 982 goto out;
979 else if ((addr = (ulong)shmaddr)) { 983 else if ((addr = (ulong)shmaddr)) {
980 if (addr & (shmlba - 1)) { 984 if (addr & (shmlba - 1)) {
981 if (shmflg & SHM_RND) 985 if (shmflg & SHM_RND)
982 addr &= ~(shmlba - 1); /* round down */ 986 addr &= ~(shmlba - 1); /* round down */
983 else 987 else
984 #ifndef __ARCH_FORCE_SHMLBA 988 #ifndef __ARCH_FORCE_SHMLBA
985 if (addr & ~PAGE_MASK) 989 if (addr & ~PAGE_MASK)
986 #endif 990 #endif
987 goto out; 991 goto out;
988 } 992 }
989 flags = MAP_SHARED | MAP_FIXED; 993 flags = MAP_SHARED | MAP_FIXED;
990 } else { 994 } else {
991 if ((shmflg & SHM_REMAP)) 995 if ((shmflg & SHM_REMAP))
992 goto out; 996 goto out;
993 997
994 flags = MAP_SHARED; 998 flags = MAP_SHARED;
995 } 999 }
996 1000
997 if (shmflg & SHM_RDONLY) { 1001 if (shmflg & SHM_RDONLY) {
998 prot = PROT_READ; 1002 prot = PROT_READ;
999 acc_mode = S_IRUGO; 1003 acc_mode = S_IRUGO;
1000 f_mode = FMODE_READ; 1004 f_mode = FMODE_READ;
1001 } else { 1005 } else {
1002 prot = PROT_READ | PROT_WRITE; 1006 prot = PROT_READ | PROT_WRITE;
1003 acc_mode = S_IRUGO | S_IWUGO; 1007 acc_mode = S_IRUGO | S_IWUGO;
1004 f_mode = FMODE_READ | FMODE_WRITE; 1008 f_mode = FMODE_READ | FMODE_WRITE;
1005 } 1009 }
1006 if (shmflg & SHM_EXEC) { 1010 if (shmflg & SHM_EXEC) {
1007 prot |= PROT_EXEC; 1011 prot |= PROT_EXEC;
1008 acc_mode |= S_IXUGO; 1012 acc_mode |= S_IXUGO;
1009 } 1013 }
1010 1014
1011 /* 1015 /*
1012 * We cannot rely on the fs check since SYSV IPC does have an 1016 * We cannot rely on the fs check since SYSV IPC does have an
1013 * additional creator id... 1017 * additional creator id...
1014 */ 1018 */
1015 ns = current->nsproxy->ipc_ns; 1019 ns = current->nsproxy->ipc_ns;
1016 shp = shm_lock_check(ns, shmid); 1020 shp = shm_lock_check(ns, shmid);
1017 if (IS_ERR(shp)) { 1021 if (IS_ERR(shp)) {
1018 err = PTR_ERR(shp); 1022 err = PTR_ERR(shp);
1019 goto out; 1023 goto out;
1020 } 1024 }
1021 1025
1022 err = -EACCES; 1026 err = -EACCES;
1023 if (ipcperms(ns, &shp->shm_perm, acc_mode)) 1027 if (ipcperms(ns, &shp->shm_perm, acc_mode))
1024 goto out_unlock; 1028 goto out_unlock;
1025 1029
1026 err = security_shm_shmat(shp, shmaddr, shmflg); 1030 err = security_shm_shmat(shp, shmaddr, shmflg);
1027 if (err) 1031 if (err)
1028 goto out_unlock; 1032 goto out_unlock;
1029 1033
1030 path = shp->shm_file->f_path; 1034 path = shp->shm_file->f_path;
1031 path_get(&path); 1035 path_get(&path);
1032 shp->shm_nattch++; 1036 shp->shm_nattch++;
1033 size = i_size_read(path.dentry->d_inode); 1037 size = i_size_read(path.dentry->d_inode);
1034 shm_unlock(shp); 1038 shm_unlock(shp);
1035 1039
1036 err = -ENOMEM; 1040 err = -ENOMEM;
1037 sfd = kzalloc(sizeof(*sfd), GFP_KERNEL); 1041 sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
1038 if (!sfd) 1042 if (!sfd)
1039 goto out_put_dentry; 1043 goto out_put_dentry;
1040 1044
1041 file = alloc_file(&path, f_mode, 1045 file = alloc_file(&path, f_mode,
1042 is_file_hugepages(shp->shm_file) ? 1046 is_file_hugepages(shp->shm_file) ?
1043 &shm_file_operations_huge : 1047 &shm_file_operations_huge :
1044 &shm_file_operations); 1048 &shm_file_operations);
1045 err = PTR_ERR(file); 1049 err = PTR_ERR(file);
1046 if (IS_ERR(file)) 1050 if (IS_ERR(file))
1047 goto out_free; 1051 goto out_free;
1048 1052
1049 file->private_data = sfd; 1053 file->private_data = sfd;
1050 file->f_mapping = shp->shm_file->f_mapping; 1054 file->f_mapping = shp->shm_file->f_mapping;
1051 sfd->id = shp->shm_perm.id; 1055 sfd->id = shp->shm_perm.id;
1052 sfd->ns = get_ipc_ns(ns); 1056 sfd->ns = get_ipc_ns(ns);
1053 sfd->file = shp->shm_file; 1057 sfd->file = shp->shm_file;
1054 sfd->vm_ops = NULL; 1058 sfd->vm_ops = NULL;
1055 1059
1056 err = security_mmap_file(file, prot, flags); 1060 err = security_mmap_file(file, prot, flags);
1057 if (err) 1061 if (err)
1058 goto out_fput; 1062 goto out_fput;
1059 1063
1060 down_write(&current->mm->mmap_sem); 1064 down_write(&current->mm->mmap_sem);
1061 if (addr && !(shmflg & SHM_REMAP)) { 1065 if (addr && !(shmflg & SHM_REMAP)) {
1062 err = -EINVAL; 1066 err = -EINVAL;
1063 if (find_vma_intersection(current->mm, addr, addr + size)) 1067 if (find_vma_intersection(current->mm, addr, addr + size))
1064 goto invalid; 1068 goto invalid;
1065 /* 1069 /*
1066 * If shm segment goes below stack, make sure there is some 1070 * If shm segment goes below stack, make sure there is some
1067 * space left for the stack to grow (at least 4 pages). 1071 * space left for the stack to grow (at least 4 pages).
1068 */ 1072 */
1069 if (addr < current->mm->start_stack && 1073 if (addr < current->mm->start_stack &&
1070 addr > current->mm->start_stack - size - PAGE_SIZE * 5) 1074 addr > current->mm->start_stack - size - PAGE_SIZE * 5)
1071 goto invalid; 1075 goto invalid;
1072 } 1076 }
1073 1077
1074 addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate); 1078 addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate);
1075 *raddr = addr; 1079 *raddr = addr;
1076 err = 0; 1080 err = 0;
1077 if (IS_ERR_VALUE(addr)) 1081 if (IS_ERR_VALUE(addr))
1078 err = (long)addr; 1082 err = (long)addr;
1079 invalid: 1083 invalid:
1080 up_write(&current->mm->mmap_sem); 1084 up_write(&current->mm->mmap_sem);
1081 if (populate) 1085 if (populate)
1082 mm_populate(addr, populate); 1086 mm_populate(addr, populate);
1083 1087
1084 out_fput: 1088 out_fput:
1085 fput(file); 1089 fput(file);
1086 1090
1087 out_nattch: 1091 out_nattch:
1088 down_write(&shm_ids(ns).rw_mutex); 1092 down_write(&shm_ids(ns).rw_mutex);
1089 shp = shm_lock(ns, shmid); 1093 shp = shm_lock(ns, shmid);
1090 BUG_ON(IS_ERR(shp)); 1094 BUG_ON(IS_ERR(shp));
1091 shp->shm_nattch--; 1095 shp->shm_nattch--;
1092 if (shm_may_destroy(ns, shp)) 1096 if (shm_may_destroy(ns, shp))
1093 shm_destroy(ns, shp); 1097 shm_destroy(ns, shp);
1094 else 1098 else
1095 shm_unlock(shp); 1099 shm_unlock(shp);
1096 up_write(&shm_ids(ns).rw_mutex); 1100 up_write(&shm_ids(ns).rw_mutex);
1097 1101
1098 out: 1102 out:
1099 return err; 1103 return err;
1100 1104
1101 out_unlock: 1105 out_unlock:
1102 shm_unlock(shp); 1106 shm_unlock(shp);
1103 goto out; 1107 goto out;
1104 1108
1105 out_free: 1109 out_free:
1106 kfree(sfd); 1110 kfree(sfd);
1107 out_put_dentry: 1111 out_put_dentry:
1108 path_put(&path); 1112 path_put(&path);
1109 goto out_nattch; 1113 goto out_nattch;
1110 } 1114 }
1111 1115
1112 SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg) 1116 SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg)
1113 { 1117 {
1114 unsigned long ret; 1118 unsigned long ret;
1115 long err; 1119 long err;
1116 1120
1117 err = do_shmat(shmid, shmaddr, shmflg, &ret, SHMLBA); 1121 err = do_shmat(shmid, shmaddr, shmflg, &ret, SHMLBA);
1118 if (err) 1122 if (err)
1119 return err; 1123 return err;
1120 force_successful_syscall_return(); 1124 force_successful_syscall_return();
1121 return (long)ret; 1125 return (long)ret;
1122 } 1126 }
1123 1127
1124 /* 1128 /*
1125 * detach and kill segment if marked destroyed. 1129 * detach and kill segment if marked destroyed.
1126 * The work is done in shm_close. 1130 * The work is done in shm_close.
1127 */ 1131 */
1128 SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) 1132 SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
1129 { 1133 {
1130 struct mm_struct *mm = current->mm; 1134 struct mm_struct *mm = current->mm;
1131 struct vm_area_struct *vma; 1135 struct vm_area_struct *vma;
1132 unsigned long addr = (unsigned long)shmaddr; 1136 unsigned long addr = (unsigned long)shmaddr;
1133 int retval = -EINVAL; 1137 int retval = -EINVAL;
1134 #ifdef CONFIG_MMU 1138 #ifdef CONFIG_MMU
1135 loff_t size = 0; 1139 loff_t size = 0;
1136 struct vm_area_struct *next; 1140 struct vm_area_struct *next;
1137 #endif 1141 #endif
1138 1142
1139 if (addr & ~PAGE_MASK) 1143 if (addr & ~PAGE_MASK)
1140 return retval; 1144 return retval;
1141 1145
1142 down_write(&mm->mmap_sem); 1146 down_write(&mm->mmap_sem);
1143 1147
1144 /* 1148 /*
1145 * This function tries to be smart and unmap shm segments that 1149 * This function tries to be smart and unmap shm segments that
1146 * were modified by partial mlock or munmap calls: 1150 * were modified by partial mlock or munmap calls:
1147 * - It first determines the size of the shm segment that should be 1151 * - It first determines the size of the shm segment that should be
1148 * unmapped: It searches for a vma that is backed by shm and that 1152 * unmapped: It searches for a vma that is backed by shm and that
1149 * started at address shmaddr. It records it's size and then unmaps 1153 * started at address shmaddr. It records it's size and then unmaps
1150 * it. 1154 * it.
1151 * - Then it unmaps all shm vmas that started at shmaddr and that 1155 * - Then it unmaps all shm vmas that started at shmaddr and that
1152 * are within the initially determined size. 1156 * are within the initially determined size.
1153 * Errors from do_munmap are ignored: the function only fails if 1157 * Errors from do_munmap are ignored: the function only fails if
1154 * it's called with invalid parameters or if it's called to unmap 1158 * it's called with invalid parameters or if it's called to unmap
1155 * a part of a vma. Both calls in this function are for full vmas, 1159 * a part of a vma. Both calls in this function are for full vmas,
1156 * the parameters are directly copied from the vma itself and always 1160 * the parameters are directly copied from the vma itself and always
1157 * valid - therefore do_munmap cannot fail. (famous last words?) 1161 * valid - therefore do_munmap cannot fail. (famous last words?)
1158 */ 1162 */
1159 /* 1163 /*
1160 * If it had been mremap()'d, the starting address would not 1164 * If it had been mremap()'d, the starting address would not
1161 * match the usual checks anyway. So assume all vma's are 1165 * match the usual checks anyway. So assume all vma's are
1162 * above the starting address given. 1166 * above the starting address given.
1163 */ 1167 */
1164 vma = find_vma(mm, addr); 1168 vma = find_vma(mm, addr);
1165 1169
1166 #ifdef CONFIG_MMU 1170 #ifdef CONFIG_MMU
1167 while (vma) { 1171 while (vma) {
1168 next = vma->vm_next; 1172 next = vma->vm_next;
1169 1173
1170 /* 1174 /*
1171 * Check if the starting address would match, i.e. it's 1175 * Check if the starting address would match, i.e. it's
1172 * a fragment created by mprotect() and/or munmap(), or 1176 * a fragment created by mprotect() and/or munmap(), or
1173 * otherwise it starts at this address with no hassles. 1177 * otherwise it starts at this address with no hassles.
1174 */ 1178 */
1175 if ((vma->vm_ops == &shm_vm_ops) && 1179 if ((vma->vm_ops == &shm_vm_ops) &&
1176 (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) { 1180 (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) {
1177 1181
1178 1182
1179 size = file_inode(vma->vm_file)->i_size; 1183 size = file_inode(vma->vm_file)->i_size;
1180 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); 1184 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
1181 /* 1185 /*
1182 * We discovered the size of the shm segment, so 1186 * We discovered the size of the shm segment, so
1183 * break out of here and fall through to the next 1187 * break out of here and fall through to the next
1184 * loop that uses the size information to stop 1188 * loop that uses the size information to stop
1185 * searching for matching vma's. 1189 * searching for matching vma's.
1186 */ 1190 */
1187 retval = 0; 1191 retval = 0;
1188 vma = next; 1192 vma = next;
1189 break; 1193 break;
1190 } 1194 }
1191 vma = next; 1195 vma = next;
1192 } 1196 }
1193 1197
1194 /* 1198 /*
1195 * We need look no further than the maximum address a fragment 1199 * We need look no further than the maximum address a fragment
1196 * could possibly have landed at. Also cast things to loff_t to 1200 * could possibly have landed at. Also cast things to loff_t to
1197 * prevent overflows and make comparisons vs. equal-width types. 1201 * prevent overflows and make comparisons vs. equal-width types.
1198 */ 1202 */
1199 size = PAGE_ALIGN(size); 1203 size = PAGE_ALIGN(size);
1200 while (vma && (loff_t)(vma->vm_end - addr) <= size) { 1204 while (vma && (loff_t)(vma->vm_end - addr) <= size) {
1201 next = vma->vm_next; 1205 next = vma->vm_next;
1202 1206
1203 /* finding a matching vma now does not alter retval */ 1207 /* finding a matching vma now does not alter retval */
1204 if ((vma->vm_ops == &shm_vm_ops) && 1208 if ((vma->vm_ops == &shm_vm_ops) &&
1205 (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) 1209 (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff)
1206 1210
1207 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); 1211 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
1208 vma = next; 1212 vma = next;
1209 } 1213 }
1210 1214
1211 #else /* CONFIG_MMU */ 1215 #else /* CONFIG_MMU */
1212 /* under NOMMU conditions, the exact address to be destroyed must be 1216 /* under NOMMU conditions, the exact address to be destroyed must be
1213 * given */ 1217 * given */
1214 retval = -EINVAL; 1218 retval = -EINVAL;
1215 if (vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { 1219 if (vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) {
1216 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); 1220 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
1217 retval = 0; 1221 retval = 0;
1218 } 1222 }
1219 1223
1220 #endif 1224 #endif
1221 1225
1222 up_write(&mm->mmap_sem); 1226 up_write(&mm->mmap_sem);
1223 return retval; 1227 return retval;
1224 } 1228 }
1225 1229
1226 #ifdef CONFIG_PROC_FS 1230 #ifdef CONFIG_PROC_FS
1227 static int sysvipc_shm_proc_show(struct seq_file *s, void *it) 1231 static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
1228 { 1232 {
1229 struct user_namespace *user_ns = seq_user_ns(s); 1233 struct user_namespace *user_ns = seq_user_ns(s);
1230 struct shmid_kernel *shp = it; 1234 struct shmid_kernel *shp = it;
1231 unsigned long rss = 0, swp = 0; 1235 unsigned long rss = 0, swp = 0;
1232 1236
1233 shm_add_rss_swap(shp, &rss, &swp); 1237 shm_add_rss_swap(shp, &rss, &swp);
1234 1238
1235 #if BITS_PER_LONG <= 32 1239 #if BITS_PER_LONG <= 32
1236 #define SIZE_SPEC "%10lu" 1240 #define SIZE_SPEC "%10lu"
1237 #else 1241 #else
1238 #define SIZE_SPEC "%21lu" 1242 #define SIZE_SPEC "%21lu"
1239 #endif 1243 #endif
1240 1244
1241 return seq_printf(s, 1245 return seq_printf(s,
1242 "%10d %10d %4o " SIZE_SPEC " %5u %5u " 1246 "%10d %10d %4o " SIZE_SPEC " %5u %5u "
1243 "%5lu %5u %5u %5u %5u %10lu %10lu %10lu " 1247 "%5lu %5u %5u %5u %5u %10lu %10lu %10lu "
1244 SIZE_SPEC " " SIZE_SPEC "\n", 1248 SIZE_SPEC " " SIZE_SPEC "\n",
1245 shp->shm_perm.key, 1249 shp->shm_perm.key,
1246 shp->shm_perm.id, 1250 shp->shm_perm.id,
1247 shp->shm_perm.mode, 1251 shp->shm_perm.mode,
1248 shp->shm_segsz, 1252 shp->shm_segsz,
1249 shp->shm_cprid, 1253 shp->shm_cprid,
1250 shp->shm_lprid, 1254 shp->shm_lprid,
1251 shp->shm_nattch, 1255 shp->shm_nattch,
1252 from_kuid_munged(user_ns, shp->shm_perm.uid), 1256 from_kuid_munged(user_ns, shp->shm_perm.uid),
1253 from_kgid_munged(user_ns, shp->shm_perm.gid), 1257 from_kgid_munged(user_ns, shp->shm_perm.gid),
1254 from_kuid_munged(user_ns, shp->shm_perm.cuid), 1258 from_kuid_munged(user_ns, shp->shm_perm.cuid),
1255 from_kgid_munged(user_ns, shp->shm_perm.cgid), 1259 from_kgid_munged(user_ns, shp->shm_perm.cgid),
1256 shp->shm_atim, 1260 shp->shm_atim,
1257 shp->shm_dtim, 1261 shp->shm_dtim,
1258 shp->shm_ctim, 1262 shp->shm_ctim,
1259 rss * PAGE_SIZE, 1263 rss * PAGE_SIZE,
1260 swp * PAGE_SIZE); 1264 swp * PAGE_SIZE);
1261 } 1265 }
1262 #endif 1266 #endif
1263 1267
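[Editorial aside, not part of the commit] The ipc/shm.c hunk above moves the hugepage round-up into the caller, so newseg() now computes hugesize = ALIGN(size, huge_page_size(hs)) itself. The fragment below is a minimal userspace sketch of that arithmetic only: it assumes a 2MB hugepage hstate and reimplements the kernel's ALIGN() macro for power-of-two alignments, so everything here is illustrative rather than kernel API.

#include <stdio.h>
#include <stddef.h>

/* Same round-up the kernel's ALIGN() performs for power-of-two alignments. */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((size_t)(a) - 1))

int main(void)
{
	size_t hugepage = 2UL * 1024 * 1024;	/* assumed 2MB hugepage size */
	size_t requests[] = {
		1UL * 1024 * 1024,		/* 1MB  -> 2MB */
		2UL * 1024 * 1024,		/* 2MB  -> 2MB */
		5UL * 1024 * 1024 + 123,	/* ~5MB -> 6MB */
	};

	for (size_t i = 0; i < sizeof(requests) / sizeof(requests[0]); i++)
		printf("size %zu -> hugesize %zu\n",
		       requests[i], ALIGN(requests[i], hugepage));
	return 0;
}

With the round-up done on the caller side, an "almost aligned" request no longer has to fail with -EINVAL; the mm/mmap.c listing that follows carries the corresponding caller-side change for the mmap() path described in the changelog.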
mm/mmap.c
1 /* 1 /*
2 * mm/mmap.c 2 * mm/mmap.c
3 * 3 *
4 * Written by obz. 4 * Written by obz.
5 * 5 *
6 * Address space accounting code <alan@lxorguk.ukuu.org.uk> 6 * Address space accounting code <alan@lxorguk.ukuu.org.uk>
7 */ 7 */
8 8
9 #include <linux/kernel.h> 9 #include <linux/kernel.h>
10 #include <linux/slab.h> 10 #include <linux/slab.h>
11 #include <linux/backing-dev.h> 11 #include <linux/backing-dev.h>
12 #include <linux/mm.h> 12 #include <linux/mm.h>
13 #include <linux/shm.h> 13 #include <linux/shm.h>
14 #include <linux/mman.h> 14 #include <linux/mman.h>
15 #include <linux/pagemap.h> 15 #include <linux/pagemap.h>
16 #include <linux/swap.h> 16 #include <linux/swap.h>
17 #include <linux/syscalls.h> 17 #include <linux/syscalls.h>
18 #include <linux/capability.h> 18 #include <linux/capability.h>
19 #include <linux/init.h> 19 #include <linux/init.h>
20 #include <linux/file.h> 20 #include <linux/file.h>
21 #include <linux/fs.h> 21 #include <linux/fs.h>
22 #include <linux/personality.h> 22 #include <linux/personality.h>
23 #include <linux/security.h> 23 #include <linux/security.h>
24 #include <linux/hugetlb.h> 24 #include <linux/hugetlb.h>
25 #include <linux/profile.h> 25 #include <linux/profile.h>
26 #include <linux/export.h> 26 #include <linux/export.h>
27 #include <linux/mount.h> 27 #include <linux/mount.h>
28 #include <linux/mempolicy.h> 28 #include <linux/mempolicy.h>
29 #include <linux/rmap.h> 29 #include <linux/rmap.h>
30 #include <linux/mmu_notifier.h> 30 #include <linux/mmu_notifier.h>
31 #include <linux/perf_event.h> 31 #include <linux/perf_event.h>
32 #include <linux/audit.h> 32 #include <linux/audit.h>
33 #include <linux/khugepaged.h> 33 #include <linux/khugepaged.h>
34 #include <linux/uprobes.h> 34 #include <linux/uprobes.h>
35 #include <linux/rbtree_augmented.h> 35 #include <linux/rbtree_augmented.h>
36 #include <linux/sched/sysctl.h> 36 #include <linux/sched/sysctl.h>
37 #include <linux/notifier.h> 37 #include <linux/notifier.h>
38 #include <linux/memory.h> 38 #include <linux/memory.h>
39 39
40 #include <asm/uaccess.h> 40 #include <asm/uaccess.h>
41 #include <asm/cacheflush.h> 41 #include <asm/cacheflush.h>
42 #include <asm/tlb.h> 42 #include <asm/tlb.h>
43 #include <asm/mmu_context.h> 43 #include <asm/mmu_context.h>
44 44
45 #include "internal.h" 45 #include "internal.h"
46 46
47 #ifndef arch_mmap_check 47 #ifndef arch_mmap_check
48 #define arch_mmap_check(addr, len, flags) (0) 48 #define arch_mmap_check(addr, len, flags) (0)
49 #endif 49 #endif
50 50
51 #ifndef arch_rebalance_pgtables 51 #ifndef arch_rebalance_pgtables
52 #define arch_rebalance_pgtables(addr, len) (addr) 52 #define arch_rebalance_pgtables(addr, len) (addr)
53 #endif 53 #endif
54 54
55 static void unmap_region(struct mm_struct *mm, 55 static void unmap_region(struct mm_struct *mm,
56 struct vm_area_struct *vma, struct vm_area_struct *prev, 56 struct vm_area_struct *vma, struct vm_area_struct *prev,
57 unsigned long start, unsigned long end); 57 unsigned long start, unsigned long end);
58 58
59 /* description of effects of mapping type and prot in current implementation. 59 /* description of effects of mapping type and prot in current implementation.
60 * this is due to the limited x86 page protection hardware. The expected 60 * this is due to the limited x86 page protection hardware. The expected
61 * behavior is in parens: 61 * behavior is in parens:
62 * 62 *
63 * map_type prot 63 * map_type prot
64 * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC 64 * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC
65 * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes 65 * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes
66 * w: (no) no w: (no) no w: (yes) yes w: (no) no 66 * w: (no) no w: (no) no w: (yes) yes w: (no) no
67 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes 67 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
68 * 68 *
69 * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes 69 * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
70 * w: (no) no w: (no) no w: (copy) copy w: (no) no 70 * w: (no) no w: (no) no w: (copy) copy w: (no) no
71 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes 71 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
72 * 72 *
73 */ 73 */
74 pgprot_t protection_map[16] = { 74 pgprot_t protection_map[16] = {
75 __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, 75 __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
76 __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 76 __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
77 }; 77 };
78 78
79 pgprot_t vm_get_page_prot(unsigned long vm_flags) 79 pgprot_t vm_get_page_prot(unsigned long vm_flags)
80 { 80 {
81 return __pgprot(pgprot_val(protection_map[vm_flags & 81 return __pgprot(pgprot_val(protection_map[vm_flags &
82 (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) | 82 (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) |
83 pgprot_val(arch_vm_get_page_prot(vm_flags))); 83 pgprot_val(arch_vm_get_page_prot(vm_flags)));
84 } 84 }
85 EXPORT_SYMBOL(vm_get_page_prot); 85 EXPORT_SYMBOL(vm_get_page_prot);
86 86
87 int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ 87 int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
88 int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ 88 int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
89 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 89 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
90 unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ 90 unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
91 unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ 91 unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
92 /* 92 /*
93 * Make sure vm_committed_as is in one cacheline and not cacheline shared with 93 * Make sure vm_committed_as is in one cacheline and not cacheline shared with
94 * other variables. It can be updated by several CPUs frequently. 94 * other variables. It can be updated by several CPUs frequently.
95 */ 95 */
96 struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; 96 struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
97 97
98 /* 98 /*
99 * The global memory commitment made in the system can be a metric 99 * The global memory commitment made in the system can be a metric
100 * that can be used to drive ballooning decisions when Linux is hosted 100 * that can be used to drive ballooning decisions when Linux is hosted
101 * as a guest. On Hyper-V, the host implements a policy engine for dynamically 101 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
102 * balancing memory across competing virtual machines that are hosted. 102 * balancing memory across competing virtual machines that are hosted.
103 * Several metrics drive this policy engine including the guest reported 103 * Several metrics drive this policy engine including the guest reported
104 * memory commitment. 104 * memory commitment.
105 */ 105 */
106 unsigned long vm_memory_committed(void) 106 unsigned long vm_memory_committed(void)
107 { 107 {
108 return percpu_counter_read_positive(&vm_committed_as); 108 return percpu_counter_read_positive(&vm_committed_as);
109 } 109 }
110 EXPORT_SYMBOL_GPL(vm_memory_committed); 110 EXPORT_SYMBOL_GPL(vm_memory_committed);
111 111
112 /* 112 /*
113 * Check that a process has enough memory to allocate a new virtual 113 * Check that a process has enough memory to allocate a new virtual
114 * mapping. 0 means there is enough memory for the allocation to 114 * mapping. 0 means there is enough memory for the allocation to
115 * succeed and -ENOMEM implies there is not. 115 * succeed and -ENOMEM implies there is not.
116 * 116 *
117 * We currently support three overcommit policies, which are set via the 117 * We currently support three overcommit policies, which are set via the
118 * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting 118 * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
119 * 119 *
120 * Strict overcommit modes added 2002 Feb 26 by Alan Cox. 120 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
121 * Additional code 2002 Jul 20 by Robert Love. 121 * Additional code 2002 Jul 20 by Robert Love.
122 * 122 *
123 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. 123 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
124 * 124 *
125 * Note this is a helper function intended to be used by LSMs which 125 * Note this is a helper function intended to be used by LSMs which
126 * wish to use this logic. 126 * wish to use this logic.
127 */ 127 */
128 int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) 128 int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
129 { 129 {
130 unsigned long free, allowed, reserve; 130 unsigned long free, allowed, reserve;
131 131
132 vm_acct_memory(pages); 132 vm_acct_memory(pages);
133 133
134 /* 134 /*
135 * Sometimes we want to use more memory than we have 135 * Sometimes we want to use more memory than we have
136 */ 136 */
137 if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) 137 if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
138 return 0; 138 return 0;
139 139
140 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 140 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
141 free = global_page_state(NR_FREE_PAGES); 141 free = global_page_state(NR_FREE_PAGES);
142 free += global_page_state(NR_FILE_PAGES); 142 free += global_page_state(NR_FILE_PAGES);
143 143
144 /* 144 /*
145 * shmem pages shouldn't be counted as free in this 145 * shmem pages shouldn't be counted as free in this
146 * case, they can't be purged, only swapped out, and 146 * case, they can't be purged, only swapped out, and
147 * that won't affect the overall amount of available 147 * that won't affect the overall amount of available
148 * memory in the system. 148 * memory in the system.
149 */ 149 */
150 free -= global_page_state(NR_SHMEM); 150 free -= global_page_state(NR_SHMEM);
151 151
152 free += get_nr_swap_pages(); 152 free += get_nr_swap_pages();
153 153
154 /* 154 /*
155 * Any slabs which are created with the 155 * Any slabs which are created with the
156 * SLAB_RECLAIM_ACCOUNT flag claim to have contents 156 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
157 * which are reclaimable, under pressure. The dentry 157 * which are reclaimable, under pressure. The dentry
158 * cache and most inode caches should fall into this 158 * cache and most inode caches should fall into this
159 */ 159 */
160 free += global_page_state(NR_SLAB_RECLAIMABLE); 160 free += global_page_state(NR_SLAB_RECLAIMABLE);
161 161
162 /* 162 /*
163 * Leave reserved pages. The pages are not for anonymous pages. 163 * Leave reserved pages. The pages are not for anonymous pages.
164 */ 164 */
165 if (free <= totalreserve_pages) 165 if (free <= totalreserve_pages)
166 goto error; 166 goto error;
167 else 167 else
168 free -= totalreserve_pages; 168 free -= totalreserve_pages;
169 169
170 /* 170 /*
171 * Reserve some for root 171 * Reserve some for root
172 */ 172 */
173 if (!cap_sys_admin) 173 if (!cap_sys_admin)
174 free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); 174 free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
175 175
176 if (free > pages) 176 if (free > pages)
177 return 0; 177 return 0;
178 178
179 goto error; 179 goto error;
180 } 180 }
181 181
182 allowed = (totalram_pages - hugetlb_total_pages()) 182 allowed = (totalram_pages - hugetlb_total_pages())
183 * sysctl_overcommit_ratio / 100; 183 * sysctl_overcommit_ratio / 100;
184 /* 184 /*
185 * Reserve some for root 185 * Reserve some for root
186 */ 186 */
187 if (!cap_sys_admin) 187 if (!cap_sys_admin)
188 allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); 188 allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
189 allowed += total_swap_pages; 189 allowed += total_swap_pages;
190 190
191 /* 191 /*
192 * Don't let a single process grow so big a user can't recover 192 * Don't let a single process grow so big a user can't recover
193 */ 193 */
194 if (mm) { 194 if (mm) {
195 reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); 195 reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
196 allowed -= min(mm->total_vm / 32, reserve); 196 allowed -= min(mm->total_vm / 32, reserve);
197 } 197 }
198 198
199 if (percpu_counter_read_positive(&vm_committed_as) < allowed) 199 if (percpu_counter_read_positive(&vm_committed_as) < allowed)
200 return 0; 200 return 0;
201 error: 201 error:
202 vm_unacct_memory(pages); 202 vm_unacct_memory(pages);
203 203
204 return -ENOMEM; 204 return -ENOMEM;
205 } 205 }
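/*
 * [Editorial note, not part of the commit diff] A worked example of the
 * strict-overcommit cap computed above, assuming 4GB of RAM, no hugepage
 * pool, the default overcommit_ratio of 50 and 1GB of swap:
 * allowed = 4GB * 50/100 + 1GB = 3GB of committable memory, minus the
 * admin reserve (8MB by default) for tasks without CAP_SYS_ADMIN and,
 * per process, min(total_vm/32, 128MB) so a runaway task still leaves
 * room for recovery.
 */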
206 206
207 /* 207 /*
208 * Requires inode->i_mapping->i_mmap_mutex 208 * Requires inode->i_mapping->i_mmap_mutex
209 */ 209 */
210 static void __remove_shared_vm_struct(struct vm_area_struct *vma, 210 static void __remove_shared_vm_struct(struct vm_area_struct *vma,
211 struct file *file, struct address_space *mapping) 211 struct file *file, struct address_space *mapping)
212 { 212 {
213 if (vma->vm_flags & VM_DENYWRITE) 213 if (vma->vm_flags & VM_DENYWRITE)
214 atomic_inc(&file_inode(file)->i_writecount); 214 atomic_inc(&file_inode(file)->i_writecount);
215 if (vma->vm_flags & VM_SHARED) 215 if (vma->vm_flags & VM_SHARED)
216 mapping->i_mmap_writable--; 216 mapping->i_mmap_writable--;
217 217
218 flush_dcache_mmap_lock(mapping); 218 flush_dcache_mmap_lock(mapping);
219 if (unlikely(vma->vm_flags & VM_NONLINEAR)) 219 if (unlikely(vma->vm_flags & VM_NONLINEAR))
220 list_del_init(&vma->shared.nonlinear); 220 list_del_init(&vma->shared.nonlinear);
221 else 221 else
222 vma_interval_tree_remove(vma, &mapping->i_mmap); 222 vma_interval_tree_remove(vma, &mapping->i_mmap);
223 flush_dcache_mmap_unlock(mapping); 223 flush_dcache_mmap_unlock(mapping);
224 } 224 }
225 225
226 /* 226 /*
227 * Unlink a file-based vm structure from its interval tree, to hide 227 * Unlink a file-based vm structure from its interval tree, to hide
228 * vma from rmap and vmtruncate before freeing its page tables. 228 * vma from rmap and vmtruncate before freeing its page tables.
229 */ 229 */
230 void unlink_file_vma(struct vm_area_struct *vma) 230 void unlink_file_vma(struct vm_area_struct *vma)
231 { 231 {
232 struct file *file = vma->vm_file; 232 struct file *file = vma->vm_file;
233 233
234 if (file) { 234 if (file) {
235 struct address_space *mapping = file->f_mapping; 235 struct address_space *mapping = file->f_mapping;
236 mutex_lock(&mapping->i_mmap_mutex); 236 mutex_lock(&mapping->i_mmap_mutex);
237 __remove_shared_vm_struct(vma, file, mapping); 237 __remove_shared_vm_struct(vma, file, mapping);
238 mutex_unlock(&mapping->i_mmap_mutex); 238 mutex_unlock(&mapping->i_mmap_mutex);
239 } 239 }
240 } 240 }
241 241
242 /* 242 /*
243 * Close a vm structure and free it, returning the next. 243 * Close a vm structure and free it, returning the next.
244 */ 244 */
245 static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) 245 static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
246 { 246 {
247 struct vm_area_struct *next = vma->vm_next; 247 struct vm_area_struct *next = vma->vm_next;
248 248
249 might_sleep(); 249 might_sleep();
250 if (vma->vm_ops && vma->vm_ops->close) 250 if (vma->vm_ops && vma->vm_ops->close)
251 vma->vm_ops->close(vma); 251 vma->vm_ops->close(vma);
252 if (vma->vm_file) 252 if (vma->vm_file)
253 fput(vma->vm_file); 253 fput(vma->vm_file);
254 mpol_put(vma_policy(vma)); 254 mpol_put(vma_policy(vma));
255 kmem_cache_free(vm_area_cachep, vma); 255 kmem_cache_free(vm_area_cachep, vma);
256 return next; 256 return next;
257 } 257 }
258 258
259 static unsigned long do_brk(unsigned long addr, unsigned long len); 259 static unsigned long do_brk(unsigned long addr, unsigned long len);
260 260
261 SYSCALL_DEFINE1(brk, unsigned long, brk) 261 SYSCALL_DEFINE1(brk, unsigned long, brk)
262 { 262 {
263 unsigned long rlim, retval; 263 unsigned long rlim, retval;
264 unsigned long newbrk, oldbrk; 264 unsigned long newbrk, oldbrk;
265 struct mm_struct *mm = current->mm; 265 struct mm_struct *mm = current->mm;
266 unsigned long min_brk; 266 unsigned long min_brk;
267 bool populate; 267 bool populate;
268 268
269 down_write(&mm->mmap_sem); 269 down_write(&mm->mmap_sem);
270 270
271 #ifdef CONFIG_COMPAT_BRK 271 #ifdef CONFIG_COMPAT_BRK
272 /* 272 /*
273 * CONFIG_COMPAT_BRK can still be overridden by setting 273 * CONFIG_COMPAT_BRK can still be overridden by setting
274 * randomize_va_space to 2, which will still cause mm->start_brk 274 * randomize_va_space to 2, which will still cause mm->start_brk
275 * to be arbitrarily shifted 275 * to be arbitrarily shifted
276 */ 276 */
277 if (current->brk_randomized) 277 if (current->brk_randomized)
278 min_brk = mm->start_brk; 278 min_brk = mm->start_brk;
279 else 279 else
280 min_brk = mm->end_data; 280 min_brk = mm->end_data;
281 #else 281 #else
282 min_brk = mm->start_brk; 282 min_brk = mm->start_brk;
283 #endif 283 #endif
284 if (brk < min_brk) 284 if (brk < min_brk)
285 goto out; 285 goto out;
286 286
287 /* 287 /*
288 * Check against rlimit here. If this check is done later after the test 288 * Check against rlimit here. If this check is done later after the test
289 * of oldbrk with newbrk then it can escape the test and let the data 289 * of oldbrk with newbrk then it can escape the test and let the data
290 * segment grow beyond its set limit in the case where the limit is 290 * segment grow beyond its set limit in the case where the limit is
291 * not page aligned -Ram Gupta 291 * not page aligned -Ram Gupta
292 */ 292 */
293 rlim = rlimit(RLIMIT_DATA); 293 rlim = rlimit(RLIMIT_DATA);
294 if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + 294 if (rlim < RLIM_INFINITY && (brk - mm->start_brk) +
295 (mm->end_data - mm->start_data) > rlim) 295 (mm->end_data - mm->start_data) > rlim)
296 goto out; 296 goto out;
297 297
298 newbrk = PAGE_ALIGN(brk); 298 newbrk = PAGE_ALIGN(brk);
299 oldbrk = PAGE_ALIGN(mm->brk); 299 oldbrk = PAGE_ALIGN(mm->brk);
300 if (oldbrk == newbrk) 300 if (oldbrk == newbrk)
301 goto set_brk; 301 goto set_brk;
302 302
303 /* Always allow shrinking brk. */ 303 /* Always allow shrinking brk. */
304 if (brk <= mm->brk) { 304 if (brk <= mm->brk) {
305 if (!do_munmap(mm, newbrk, oldbrk-newbrk)) 305 if (!do_munmap(mm, newbrk, oldbrk-newbrk))
306 goto set_brk; 306 goto set_brk;
307 goto out; 307 goto out;
308 } 308 }
309 309
310 /* Check against existing mmap mappings. */ 310 /* Check against existing mmap mappings. */
311 if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) 311 if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
312 goto out; 312 goto out;
313 313
314 /* Ok, looks good - let it rip. */ 314 /* Ok, looks good - let it rip. */
315 if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) 315 if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
316 goto out; 316 goto out;
317 317
318 set_brk: 318 set_brk:
319 mm->brk = brk; 319 mm->brk = brk;
320 populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; 320 populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
321 up_write(&mm->mmap_sem); 321 up_write(&mm->mmap_sem);
322 if (populate) 322 if (populate)
323 mm_populate(oldbrk, newbrk - oldbrk); 323 mm_populate(oldbrk, newbrk - oldbrk);
324 return brk; 324 return brk;
325 325
326 out: 326 out:
327 retval = mm->brk; 327 retval = mm->brk;
328 up_write(&mm->mmap_sem); 328 up_write(&mm->mmap_sem);
329 return retval; 329 return retval;
330 } 330 }
331 331
332 static long vma_compute_subtree_gap(struct vm_area_struct *vma) 332 static long vma_compute_subtree_gap(struct vm_area_struct *vma)
333 { 333 {
334 unsigned long max, subtree_gap; 334 unsigned long max, subtree_gap;
335 max = vma->vm_start; 335 max = vma->vm_start;
336 if (vma->vm_prev) 336 if (vma->vm_prev)
337 max -= vma->vm_prev->vm_end; 337 max -= vma->vm_prev->vm_end;
338 if (vma->vm_rb.rb_left) { 338 if (vma->vm_rb.rb_left) {
339 subtree_gap = rb_entry(vma->vm_rb.rb_left, 339 subtree_gap = rb_entry(vma->vm_rb.rb_left,
340 struct vm_area_struct, vm_rb)->rb_subtree_gap; 340 struct vm_area_struct, vm_rb)->rb_subtree_gap;
341 if (subtree_gap > max) 341 if (subtree_gap > max)
342 max = subtree_gap; 342 max = subtree_gap;
343 } 343 }
344 if (vma->vm_rb.rb_right) { 344 if (vma->vm_rb.rb_right) {
345 subtree_gap = rb_entry(vma->vm_rb.rb_right, 345 subtree_gap = rb_entry(vma->vm_rb.rb_right,
346 struct vm_area_struct, vm_rb)->rb_subtree_gap; 346 struct vm_area_struct, vm_rb)->rb_subtree_gap;
347 if (subtree_gap > max) 347 if (subtree_gap > max)
348 max = subtree_gap; 348 max = subtree_gap;
349 } 349 }
350 return max; 350 return max;
351 } 351 }
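vma_compute_subtree_gap() combines the free gap just in front of this vma with the largest gaps cached in its left and right children, which is what lets gap searches skip whole subtrees. A stand-alone sketch of the same quantity computed over a plain sorted array of [start, end) ranges (hypothetical types, illustrative only):

#include <stdio.h>

struct range { unsigned long start, end; };

/* Largest free gap in front of any range -- the value rb_subtree_gap caches per subtree. */
static unsigned long max_gap(const struct range *r, int n)
{
	unsigned long prev_end = 0, max = 0;
	int i;

	for (i = 0; i < n; i++) {
		if (r[i].start - prev_end > max)
			max = r[i].start - prev_end;
		prev_end = r[i].end;
	}
	return max;
}

int main(void)
{
	struct range vmas[] = { { 0x1000, 0x3000 }, { 0x8000, 0x9000 }, { 0xb000, 0xc000 } };

	printf("largest gap: %#lx\n", max_gap(vmas, 3));	/* 0x5000 */
	return 0;
}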
352 352
353 #ifdef CONFIG_DEBUG_VM_RB 353 #ifdef CONFIG_DEBUG_VM_RB
354 static int browse_rb(struct rb_root *root) 354 static int browse_rb(struct rb_root *root)
355 { 355 {
356 int i = 0, j, bug = 0; 356 int i = 0, j, bug = 0;
357 struct rb_node *nd, *pn = NULL; 357 struct rb_node *nd, *pn = NULL;
358 unsigned long prev = 0, pend = 0; 358 unsigned long prev = 0, pend = 0;
359 359
360 for (nd = rb_first(root); nd; nd = rb_next(nd)) { 360 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
361 struct vm_area_struct *vma; 361 struct vm_area_struct *vma;
362 vma = rb_entry(nd, struct vm_area_struct, vm_rb); 362 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
363 if (vma->vm_start < prev) { 363 if (vma->vm_start < prev) {
364 printk("vm_start %lx prev %lx\n", vma->vm_start, prev); 364 printk("vm_start %lx prev %lx\n", vma->vm_start, prev);
365 bug = 1; 365 bug = 1;
366 } 366 }
367 if (vma->vm_start < pend) { 367 if (vma->vm_start < pend) {
368 printk("vm_start %lx pend %lx\n", vma->vm_start, pend); 368 printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
369 bug = 1; 369 bug = 1;
370 } 370 }
371 if (vma->vm_start > vma->vm_end) { 371 if (vma->vm_start > vma->vm_end) {
372 printk("vm_end %lx < vm_start %lx\n", 372 printk("vm_end %lx < vm_start %lx\n",
373 vma->vm_end, vma->vm_start); 373 vma->vm_end, vma->vm_start);
374 bug = 1; 374 bug = 1;
375 } 375 }
376 if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { 376 if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
377 printk("free gap %lx, correct %lx\n", 377 printk("free gap %lx, correct %lx\n",
378 vma->rb_subtree_gap, 378 vma->rb_subtree_gap,
379 vma_compute_subtree_gap(vma)); 379 vma_compute_subtree_gap(vma));
380 bug = 1; 380 bug = 1;
381 } 381 }
382 i++; 382 i++;
383 pn = nd; 383 pn = nd;
384 prev = vma->vm_start; 384 prev = vma->vm_start;
385 pend = vma->vm_end; 385 pend = vma->vm_end;
386 } 386 }
387 j = 0; 387 j = 0;
388 for (nd = pn; nd; nd = rb_prev(nd)) 388 for (nd = pn; nd; nd = rb_prev(nd))
389 j++; 389 j++;
390 if (i != j) { 390 if (i != j) {
391 printk("backwards %d, forwards %d\n", j, i); 391 printk("backwards %d, forwards %d\n", j, i);
392 bug = 1; 392 bug = 1;
393 } 393 }
394 return bug ? -1 : i; 394 return bug ? -1 : i;
395 } 395 }
396 396
397 static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) 397 static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
398 { 398 {
399 struct rb_node *nd; 399 struct rb_node *nd;
400 400
401 for (nd = rb_first(root); nd; nd = rb_next(nd)) { 401 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
402 struct vm_area_struct *vma; 402 struct vm_area_struct *vma;
403 vma = rb_entry(nd, struct vm_area_struct, vm_rb); 403 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
404 BUG_ON(vma != ignore && 404 BUG_ON(vma != ignore &&
405 vma->rb_subtree_gap != vma_compute_subtree_gap(vma)); 405 vma->rb_subtree_gap != vma_compute_subtree_gap(vma));
406 } 406 }
407 } 407 }
408 408
409 void validate_mm(struct mm_struct *mm) 409 void validate_mm(struct mm_struct *mm)
410 { 410 {
411 int bug = 0; 411 int bug = 0;
412 int i = 0; 412 int i = 0;
413 unsigned long highest_address = 0; 413 unsigned long highest_address = 0;
414 struct vm_area_struct *vma = mm->mmap; 414 struct vm_area_struct *vma = mm->mmap;
415 while (vma) { 415 while (vma) {
416 struct anon_vma_chain *avc; 416 struct anon_vma_chain *avc;
417 vma_lock_anon_vma(vma); 417 vma_lock_anon_vma(vma);
418 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 418 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
419 anon_vma_interval_tree_verify(avc); 419 anon_vma_interval_tree_verify(avc);
420 vma_unlock_anon_vma(vma); 420 vma_unlock_anon_vma(vma);
421 highest_address = vma->vm_end; 421 highest_address = vma->vm_end;
422 vma = vma->vm_next; 422 vma = vma->vm_next;
423 i++; 423 i++;
424 } 424 }
425 if (i != mm->map_count) { 425 if (i != mm->map_count) {
426 printk("map_count %d vm_next %d\n", mm->map_count, i); 426 printk("map_count %d vm_next %d\n", mm->map_count, i);
427 bug = 1; 427 bug = 1;
428 } 428 }
429 if (highest_address != mm->highest_vm_end) { 429 if (highest_address != mm->highest_vm_end) {
430 printk("mm->highest_vm_end %lx, found %lx\n", 430 printk("mm->highest_vm_end %lx, found %lx\n",
431 mm->highest_vm_end, highest_address); 431 mm->highest_vm_end, highest_address);
432 bug = 1; 432 bug = 1;
433 } 433 }
434 i = browse_rb(&mm->mm_rb); 434 i = browse_rb(&mm->mm_rb);
435 if (i != mm->map_count) { 435 if (i != mm->map_count) {
436 printk("map_count %d rb %d\n", mm->map_count, i); 436 printk("map_count %d rb %d\n", mm->map_count, i);
437 bug = 1; 437 bug = 1;
438 } 438 }
439 BUG_ON(bug); 439 BUG_ON(bug);
440 } 440 }
441 #else 441 #else
442 #define validate_mm_rb(root, ignore) do { } while (0) 442 #define validate_mm_rb(root, ignore) do { } while (0)
443 #define validate_mm(mm) do { } while (0) 443 #define validate_mm(mm) do { } while (0)
444 #endif 444 #endif
445 445
446 RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb, 446 RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
447 unsigned long, rb_subtree_gap, vma_compute_subtree_gap) 447 unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
448 448
449 /* 449 /*
450 * Update augmented rbtree rb_subtree_gap values after vma->vm_start or 450 * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
451 * vma->vm_prev->vm_end values changed, without modifying the vma's position 451 * vma->vm_prev->vm_end values changed, without modifying the vma's position
452 * in the rbtree. 452 * in the rbtree.
453 */ 453 */
454 static void vma_gap_update(struct vm_area_struct *vma) 454 static void vma_gap_update(struct vm_area_struct *vma)
455 { 455 {
456 /* 456 /*
457 * As it turns out, RB_DECLARE_CALLBACKS() already created a callback 457 * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
458 * function that does exactly what we want. 458 * function that does exactly what we want.
459 */ 459 */
460 vma_gap_callbacks_propagate(&vma->vm_rb, NULL); 460 vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
461 } 461 }
462 462
463 static inline void vma_rb_insert(struct vm_area_struct *vma, 463 static inline void vma_rb_insert(struct vm_area_struct *vma,
464 struct rb_root *root) 464 struct rb_root *root)
465 { 465 {
466 /* All rb_subtree_gap values must be consistent prior to insertion */ 466 /* All rb_subtree_gap values must be consistent prior to insertion */
467 validate_mm_rb(root, NULL); 467 validate_mm_rb(root, NULL);
468 468
469 rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); 469 rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
470 } 470 }
471 471
472 static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) 472 static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
473 { 473 {
474 /* 474 /*
475 * All rb_subtree_gap values must be consistent prior to erase, 475 * All rb_subtree_gap values must be consistent prior to erase,
476 * with the possible exception of the vma being erased. 476 * with the possible exception of the vma being erased.
477 */ 477 */
478 validate_mm_rb(root, vma); 478 validate_mm_rb(root, vma);
479 479
480 /* 480 /*
481 * Note rb_erase_augmented is a fairly large inline function, 481 * Note rb_erase_augmented is a fairly large inline function,
482 * so make sure we instantiate it only once with our desired 482 * so make sure we instantiate it only once with our desired
483 * augmented rbtree callbacks. 483 * augmented rbtree callbacks.
484 */ 484 */
485 rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); 485 rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
486 } 486 }
487 487
488 /* 488 /*
489 * vma has some anon_vma assigned, and is already inserted on that 489 * vma has some anon_vma assigned, and is already inserted on that
490 * anon_vma's interval trees. 490 * anon_vma's interval trees.
491 * 491 *
492 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the 492 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
493 * vma must be removed from the anon_vma's interval trees using 493 * vma must be removed from the anon_vma's interval trees using
494 * anon_vma_interval_tree_pre_update_vma(). 494 * anon_vma_interval_tree_pre_update_vma().
495 * 495 *
496 * After the update, the vma will be reinserted using 496 * After the update, the vma will be reinserted using
497 * anon_vma_interval_tree_post_update_vma(). 497 * anon_vma_interval_tree_post_update_vma().
498 * 498 *
499 * The entire update must be protected by exclusive mmap_sem and by 499 * The entire update must be protected by exclusive mmap_sem and by
500 * the root anon_vma's mutex. 500 * the root anon_vma's mutex.
501 */ 501 */
502 static inline void 502 static inline void
503 anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) 503 anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
504 { 504 {
505 struct anon_vma_chain *avc; 505 struct anon_vma_chain *avc;
506 506
507 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 507 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
508 anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); 508 anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
509 } 509 }
510 510
511 static inline void 511 static inline void
512 anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) 512 anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
513 { 513 {
514 struct anon_vma_chain *avc; 514 struct anon_vma_chain *avc;
515 515
516 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 516 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
517 anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); 517 anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
518 } 518 }
519 519
520 static int find_vma_links(struct mm_struct *mm, unsigned long addr, 520 static int find_vma_links(struct mm_struct *mm, unsigned long addr,
521 unsigned long end, struct vm_area_struct **pprev, 521 unsigned long end, struct vm_area_struct **pprev,
522 struct rb_node ***rb_link, struct rb_node **rb_parent) 522 struct rb_node ***rb_link, struct rb_node **rb_parent)
523 { 523 {
524 struct rb_node **__rb_link, *__rb_parent, *rb_prev; 524 struct rb_node **__rb_link, *__rb_parent, *rb_prev;
525 525
526 __rb_link = &mm->mm_rb.rb_node; 526 __rb_link = &mm->mm_rb.rb_node;
527 rb_prev = __rb_parent = NULL; 527 rb_prev = __rb_parent = NULL;
528 528
529 while (*__rb_link) { 529 while (*__rb_link) {
530 struct vm_area_struct *vma_tmp; 530 struct vm_area_struct *vma_tmp;
531 531
532 __rb_parent = *__rb_link; 532 __rb_parent = *__rb_link;
533 vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); 533 vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
534 534
535 if (vma_tmp->vm_end > addr) { 535 if (vma_tmp->vm_end > addr) {
536 /* Fail if an existing vma overlaps the area */ 536 /* Fail if an existing vma overlaps the area */
537 if (vma_tmp->vm_start < end) 537 if (vma_tmp->vm_start < end)
538 return -ENOMEM; 538 return -ENOMEM;
539 __rb_link = &__rb_parent->rb_left; 539 __rb_link = &__rb_parent->rb_left;
540 } else { 540 } else {
541 rb_prev = __rb_parent; 541 rb_prev = __rb_parent;
542 __rb_link = &__rb_parent->rb_right; 542 __rb_link = &__rb_parent->rb_right;
543 } 543 }
544 } 544 }
545 545
546 *pprev = NULL; 546 *pprev = NULL;
547 if (rb_prev) 547 if (rb_prev)
548 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); 548 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
549 *rb_link = __rb_link; 549 *rb_link = __rb_link;
550 *rb_parent = __rb_parent; 550 *rb_parent = __rb_parent;
551 return 0; 551 return 0;
552 } 552 }
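find_vma_links() is a binary-search descent: it goes left while the current vma ends above addr, right otherwise, refuses the insertion if a visited vma intersects [addr, end), and remembers where the new node would hang. The same idea over a sorted array (illustrative sketch, not kernel code):

#include <errno.h>
#include <stdio.h>

struct range { unsigned long start, end; };

/* Return the insertion index for [addr, end), or -ENOMEM if it overlaps an entry. */
static int find_slot(const struct range *r, int n, unsigned long addr, unsigned long end)
{
	int i = 0;

	/* Skip entries that end at or before addr (the "go right" case). */
	while (i < n && r[i].end <= addr)
		i++;
	/* The first entry ending above addr must not start before end, or we overlap. */
	if (i < n && r[i].start < end)
		return -ENOMEM;
	return i;	/* slot where the new range would be linked */
}

int main(void)
{
	struct range vmas[] = { { 0x1000, 0x3000 }, { 0x8000, 0x9000 } };

	printf("%d\n", find_slot(vmas, 2, 0x4000, 0x6000));	/* 1: fits in the gap */
	printf("%d\n", find_slot(vmas, 2, 0x2000, 0x5000));	/* -12: overlaps */
	return 0;
}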
553 553
554 static unsigned long count_vma_pages_range(struct mm_struct *mm, 554 static unsigned long count_vma_pages_range(struct mm_struct *mm,
555 unsigned long addr, unsigned long end) 555 unsigned long addr, unsigned long end)
556 { 556 {
557 unsigned long nr_pages = 0; 557 unsigned long nr_pages = 0;
558 struct vm_area_struct *vma; 558 struct vm_area_struct *vma;
559 559
560 /* Find first overlapping mapping */ 560 /* Find first overlapping mapping */
561 vma = find_vma_intersection(mm, addr, end); 561 vma = find_vma_intersection(mm, addr, end);
562 if (!vma) 562 if (!vma)
563 return 0; 563 return 0;
564 564
565 nr_pages = (min(end, vma->vm_end) - 565 nr_pages = (min(end, vma->vm_end) -
566 max(addr, vma->vm_start)) >> PAGE_SHIFT; 566 max(addr, vma->vm_start)) >> PAGE_SHIFT;
567 567
568 /* Iterate over the rest of the overlaps */ 568 /* Iterate over the rest of the overlaps */
569 for (vma = vma->vm_next; vma; vma = vma->vm_next) { 569 for (vma = vma->vm_next; vma; vma = vma->vm_next) {
570 unsigned long overlap_len; 570 unsigned long overlap_len;
571 571
572 if (vma->vm_start > end) 572 if (vma->vm_start > end)
573 break; 573 break;
574 574
575 overlap_len = min(end, vma->vm_end) - vma->vm_start; 575 overlap_len = min(end, vma->vm_end) - vma->vm_start;
576 nr_pages += overlap_len >> PAGE_SHIFT; 576 nr_pages += overlap_len >> PAGE_SHIFT;
577 } 577 }
578 578
579 return nr_pages; 579 return nr_pages;
580 } 580 }
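For each overlapping vma the function counts only the pages inside the requested window, i.e. (min(end, vm_end) - max(addr, vm_start)) >> PAGE_SHIFT. A tiny worked example with 4 KiB pages (illustrative values only):

#include <stdio.h>

#define EXAMPLE_PAGE_SHIFT 12	/* assume 4 KiB pages */

int main(void)
{
	/* Requested window and one existing vma that partially overlaps it. */
	unsigned long addr = 0x1c000, end = 0x30000;
	unsigned long vm_start = 0x10000, vm_end = 0x20000;
	unsigned long lo = addr > vm_start ? addr : vm_start;
	unsigned long hi = end < vm_end ? end : vm_end;

	printf("%lu overlapping pages\n", (hi - lo) >> EXAMPLE_PAGE_SHIFT);	/* 4 */
	return 0;
}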
581 581
582 void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 582 void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
583 struct rb_node **rb_link, struct rb_node *rb_parent) 583 struct rb_node **rb_link, struct rb_node *rb_parent)
584 { 584 {
585 /* Update tracking information for the gap following the new vma. */ 585 /* Update tracking information for the gap following the new vma. */
586 if (vma->vm_next) 586 if (vma->vm_next)
587 vma_gap_update(vma->vm_next); 587 vma_gap_update(vma->vm_next);
588 else 588 else
589 mm->highest_vm_end = vma->vm_end; 589 mm->highest_vm_end = vma->vm_end;
590 590
591 /* 591 /*
592 * vma->vm_prev wasn't known when we followed the rbtree to find the 592 * vma->vm_prev wasn't known when we followed the rbtree to find the
593 * correct insertion point for that vma. As a result, we could not 593 * correct insertion point for that vma. As a result, we could not
594 * update the vma vm_rb parents rb_subtree_gap values on the way down. 594 * update the vma vm_rb parents rb_subtree_gap values on the way down.
595 * So, we first insert the vma with a zero rb_subtree_gap value 595 * So, we first insert the vma with a zero rb_subtree_gap value
596 * (to be consistent with what we did on the way down), and then 596 * (to be consistent with what we did on the way down), and then
597 * immediately update the gap to the correct value. Finally we 597 * immediately update the gap to the correct value. Finally we
598 * rebalance the rbtree after all augmented values have been set. 598 * rebalance the rbtree after all augmented values have been set.
599 */ 599 */
600 rb_link_node(&vma->vm_rb, rb_parent, rb_link); 600 rb_link_node(&vma->vm_rb, rb_parent, rb_link);
601 vma->rb_subtree_gap = 0; 601 vma->rb_subtree_gap = 0;
602 vma_gap_update(vma); 602 vma_gap_update(vma);
603 vma_rb_insert(vma, &mm->mm_rb); 603 vma_rb_insert(vma, &mm->mm_rb);
604 } 604 }
605 605
606 static void __vma_link_file(struct vm_area_struct *vma) 606 static void __vma_link_file(struct vm_area_struct *vma)
607 { 607 {
608 struct file *file; 608 struct file *file;
609 609
610 file = vma->vm_file; 610 file = vma->vm_file;
611 if (file) { 611 if (file) {
612 struct address_space *mapping = file->f_mapping; 612 struct address_space *mapping = file->f_mapping;
613 613
614 if (vma->vm_flags & VM_DENYWRITE) 614 if (vma->vm_flags & VM_DENYWRITE)
615 atomic_dec(&file_inode(file)->i_writecount); 615 atomic_dec(&file_inode(file)->i_writecount);
616 if (vma->vm_flags & VM_SHARED) 616 if (vma->vm_flags & VM_SHARED)
617 mapping->i_mmap_writable++; 617 mapping->i_mmap_writable++;
618 618
619 flush_dcache_mmap_lock(mapping); 619 flush_dcache_mmap_lock(mapping);
620 if (unlikely(vma->vm_flags & VM_NONLINEAR)) 620 if (unlikely(vma->vm_flags & VM_NONLINEAR))
621 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); 621 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
622 else 622 else
623 vma_interval_tree_insert(vma, &mapping->i_mmap); 623 vma_interval_tree_insert(vma, &mapping->i_mmap);
624 flush_dcache_mmap_unlock(mapping); 624 flush_dcache_mmap_unlock(mapping);
625 } 625 }
626 } 626 }
627 627
628 static void 628 static void
629 __vma_link(struct mm_struct *mm, struct vm_area_struct *vma, 629 __vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
630 struct vm_area_struct *prev, struct rb_node **rb_link, 630 struct vm_area_struct *prev, struct rb_node **rb_link,
631 struct rb_node *rb_parent) 631 struct rb_node *rb_parent)
632 { 632 {
633 __vma_link_list(mm, vma, prev, rb_parent); 633 __vma_link_list(mm, vma, prev, rb_parent);
634 __vma_link_rb(mm, vma, rb_link, rb_parent); 634 __vma_link_rb(mm, vma, rb_link, rb_parent);
635 } 635 }
636 636
637 static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, 637 static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
638 struct vm_area_struct *prev, struct rb_node **rb_link, 638 struct vm_area_struct *prev, struct rb_node **rb_link,
639 struct rb_node *rb_parent) 639 struct rb_node *rb_parent)
640 { 640 {
641 struct address_space *mapping = NULL; 641 struct address_space *mapping = NULL;
642 642
643 if (vma->vm_file) 643 if (vma->vm_file)
644 mapping = vma->vm_file->f_mapping; 644 mapping = vma->vm_file->f_mapping;
645 645
646 if (mapping) 646 if (mapping)
647 mutex_lock(&mapping->i_mmap_mutex); 647 mutex_lock(&mapping->i_mmap_mutex);
648 648
649 __vma_link(mm, vma, prev, rb_link, rb_parent); 649 __vma_link(mm, vma, prev, rb_link, rb_parent);
650 __vma_link_file(vma); 650 __vma_link_file(vma);
651 651
652 if (mapping) 652 if (mapping)
653 mutex_unlock(&mapping->i_mmap_mutex); 653 mutex_unlock(&mapping->i_mmap_mutex);
654 654
655 mm->map_count++; 655 mm->map_count++;
656 validate_mm(mm); 656 validate_mm(mm);
657 } 657 }
658 658
659 /* 659 /*
660 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the 660 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
661 * mm's list and rbtree. It has already been inserted into the interval tree. 661 * mm's list and rbtree. It has already been inserted into the interval tree.
662 */ 662 */
663 static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) 663 static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
664 { 664 {
665 struct vm_area_struct *prev; 665 struct vm_area_struct *prev;
666 struct rb_node **rb_link, *rb_parent; 666 struct rb_node **rb_link, *rb_parent;
667 667
668 if (find_vma_links(mm, vma->vm_start, vma->vm_end, 668 if (find_vma_links(mm, vma->vm_start, vma->vm_end,
669 &prev, &rb_link, &rb_parent)) 669 &prev, &rb_link, &rb_parent))
670 BUG(); 670 BUG();
671 __vma_link(mm, vma, prev, rb_link, rb_parent); 671 __vma_link(mm, vma, prev, rb_link, rb_parent);
672 mm->map_count++; 672 mm->map_count++;
673 } 673 }
674 674
675 static inline void 675 static inline void
676 __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, 676 __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
677 struct vm_area_struct *prev) 677 struct vm_area_struct *prev)
678 { 678 {
679 struct vm_area_struct *next; 679 struct vm_area_struct *next;
680 680
681 vma_rb_erase(vma, &mm->mm_rb); 681 vma_rb_erase(vma, &mm->mm_rb);
682 prev->vm_next = next = vma->vm_next; 682 prev->vm_next = next = vma->vm_next;
683 if (next) 683 if (next)
684 next->vm_prev = prev; 684 next->vm_prev = prev;
685 if (mm->mmap_cache == vma) 685 if (mm->mmap_cache == vma)
686 mm->mmap_cache = prev; 686 mm->mmap_cache = prev;
687 } 687 }
688 688
689 /* 689 /*
690 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that 690 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
691 * is already present in an i_mmap tree without adjusting the tree. 691 * is already present in an i_mmap tree without adjusting the tree.
692 * The following helper function should be used when such adjustments 692 * The following helper function should be used when such adjustments
693 * are necessary. The "insert" vma (if any) is to be inserted 693 * are necessary. The "insert" vma (if any) is to be inserted
694 * before we drop the necessary locks. 694 * before we drop the necessary locks.
695 */ 695 */
696 int vma_adjust(struct vm_area_struct *vma, unsigned long start, 696 int vma_adjust(struct vm_area_struct *vma, unsigned long start,
697 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) 697 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
698 { 698 {
699 struct mm_struct *mm = vma->vm_mm; 699 struct mm_struct *mm = vma->vm_mm;
700 struct vm_area_struct *next = vma->vm_next; 700 struct vm_area_struct *next = vma->vm_next;
701 struct vm_area_struct *importer = NULL; 701 struct vm_area_struct *importer = NULL;
702 struct address_space *mapping = NULL; 702 struct address_space *mapping = NULL;
703 struct rb_root *root = NULL; 703 struct rb_root *root = NULL;
704 struct anon_vma *anon_vma = NULL; 704 struct anon_vma *anon_vma = NULL;
705 struct file *file = vma->vm_file; 705 struct file *file = vma->vm_file;
706 bool start_changed = false, end_changed = false; 706 bool start_changed = false, end_changed = false;
707 long adjust_next = 0; 707 long adjust_next = 0;
708 int remove_next = 0; 708 int remove_next = 0;
709 709
710 if (next && !insert) { 710 if (next && !insert) {
711 struct vm_area_struct *exporter = NULL; 711 struct vm_area_struct *exporter = NULL;
712 712
713 if (end >= next->vm_end) { 713 if (end >= next->vm_end) {
714 /* 714 /*
715 * vma expands, overlapping all the next, and 715 * vma expands, overlapping all the next, and
716 * perhaps the one after too (mprotect case 6). 716 * perhaps the one after too (mprotect case 6).
717 */ 717 */
718 again: remove_next = 1 + (end > next->vm_end); 718 again: remove_next = 1 + (end > next->vm_end);
719 end = next->vm_end; 719 end = next->vm_end;
720 exporter = next; 720 exporter = next;
721 importer = vma; 721 importer = vma;
722 } else if (end > next->vm_start) { 722 } else if (end > next->vm_start) {
723 /* 723 /*
724 * vma expands, overlapping part of the next: 724 * vma expands, overlapping part of the next:
725 * mprotect case 5 shifting the boundary up. 725 * mprotect case 5 shifting the boundary up.
726 */ 726 */
727 adjust_next = (end - next->vm_start) >> PAGE_SHIFT; 727 adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
728 exporter = next; 728 exporter = next;
729 importer = vma; 729 importer = vma;
730 } else if (end < vma->vm_end) { 730 } else if (end < vma->vm_end) {
731 /* 731 /*
732 * vma shrinks, and !insert tells it's not 732 * vma shrinks, and !insert tells it's not
733 * split_vma inserting another: so it must be 733 * split_vma inserting another: so it must be
734 * mprotect case 4 shifting the boundary down. 734 * mprotect case 4 shifting the boundary down.
735 */ 735 */
736 adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); 736 adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT);
737 exporter = vma; 737 exporter = vma;
738 importer = next; 738 importer = next;
739 } 739 }
740 740
741 /* 741 /*
742 * Easily overlooked: when mprotect shifts the boundary, 742 * Easily overlooked: when mprotect shifts the boundary,
743 * make sure the expanding vma has anon_vma set if the 743 * make sure the expanding vma has anon_vma set if the
744 * shrinking vma had, to cover any anon pages imported. 744 * shrinking vma had, to cover any anon pages imported.
745 */ 745 */
746 if (exporter && exporter->anon_vma && !importer->anon_vma) { 746 if (exporter && exporter->anon_vma && !importer->anon_vma) {
747 if (anon_vma_clone(importer, exporter)) 747 if (anon_vma_clone(importer, exporter))
748 return -ENOMEM; 748 return -ENOMEM;
749 importer->anon_vma = exporter->anon_vma; 749 importer->anon_vma = exporter->anon_vma;
750 } 750 }
751 } 751 }
752 752
753 if (file) { 753 if (file) {
754 mapping = file->f_mapping; 754 mapping = file->f_mapping;
755 if (!(vma->vm_flags & VM_NONLINEAR)) { 755 if (!(vma->vm_flags & VM_NONLINEAR)) {
756 root = &mapping->i_mmap; 756 root = &mapping->i_mmap;
757 uprobe_munmap(vma, vma->vm_start, vma->vm_end); 757 uprobe_munmap(vma, vma->vm_start, vma->vm_end);
758 758
759 if (adjust_next) 759 if (adjust_next)
760 uprobe_munmap(next, next->vm_start, 760 uprobe_munmap(next, next->vm_start,
761 next->vm_end); 761 next->vm_end);
762 } 762 }
763 763
764 mutex_lock(&mapping->i_mmap_mutex); 764 mutex_lock(&mapping->i_mmap_mutex);
765 if (insert) { 765 if (insert) {
766 /* 766 /*
767 * Put into interval tree now, so instantiated pages 767 * Put into interval tree now, so instantiated pages
768 * are visible to arm/parisc __flush_dcache_page 768 * are visible to arm/parisc __flush_dcache_page
769 * throughout; but we cannot insert into address 769 * throughout; but we cannot insert into address
770 * space until vma start or end is updated. 770 * space until vma start or end is updated.
771 */ 771 */
772 __vma_link_file(insert); 772 __vma_link_file(insert);
773 } 773 }
774 } 774 }
775 775
776 vma_adjust_trans_huge(vma, start, end, adjust_next); 776 vma_adjust_trans_huge(vma, start, end, adjust_next);
777 777
778 anon_vma = vma->anon_vma; 778 anon_vma = vma->anon_vma;
779 if (!anon_vma && adjust_next) 779 if (!anon_vma && adjust_next)
780 anon_vma = next->anon_vma; 780 anon_vma = next->anon_vma;
781 if (anon_vma) { 781 if (anon_vma) {
782 VM_BUG_ON(adjust_next && next->anon_vma && 782 VM_BUG_ON(adjust_next && next->anon_vma &&
783 anon_vma != next->anon_vma); 783 anon_vma != next->anon_vma);
784 anon_vma_lock_write(anon_vma); 784 anon_vma_lock_write(anon_vma);
785 anon_vma_interval_tree_pre_update_vma(vma); 785 anon_vma_interval_tree_pre_update_vma(vma);
786 if (adjust_next) 786 if (adjust_next)
787 anon_vma_interval_tree_pre_update_vma(next); 787 anon_vma_interval_tree_pre_update_vma(next);
788 } 788 }
789 789
790 if (root) { 790 if (root) {
791 flush_dcache_mmap_lock(mapping); 791 flush_dcache_mmap_lock(mapping);
792 vma_interval_tree_remove(vma, root); 792 vma_interval_tree_remove(vma, root);
793 if (adjust_next) 793 if (adjust_next)
794 vma_interval_tree_remove(next, root); 794 vma_interval_tree_remove(next, root);
795 } 795 }
796 796
797 if (start != vma->vm_start) { 797 if (start != vma->vm_start) {
798 vma->vm_start = start; 798 vma->vm_start = start;
799 start_changed = true; 799 start_changed = true;
800 } 800 }
801 if (end != vma->vm_end) { 801 if (end != vma->vm_end) {
802 vma->vm_end = end; 802 vma->vm_end = end;
803 end_changed = true; 803 end_changed = true;
804 } 804 }
805 vma->vm_pgoff = pgoff; 805 vma->vm_pgoff = pgoff;
806 if (adjust_next) { 806 if (adjust_next) {
807 next->vm_start += adjust_next << PAGE_SHIFT; 807 next->vm_start += adjust_next << PAGE_SHIFT;
808 next->vm_pgoff += adjust_next; 808 next->vm_pgoff += adjust_next;
809 } 809 }
810 810
811 if (root) { 811 if (root) {
812 if (adjust_next) 812 if (adjust_next)
813 vma_interval_tree_insert(next, root); 813 vma_interval_tree_insert(next, root);
814 vma_interval_tree_insert(vma, root); 814 vma_interval_tree_insert(vma, root);
815 flush_dcache_mmap_unlock(mapping); 815 flush_dcache_mmap_unlock(mapping);
816 } 816 }
817 817
818 if (remove_next) { 818 if (remove_next) {
819 /* 819 /*
820 * vma_merge has merged next into vma, and needs 820 * vma_merge has merged next into vma, and needs
821 * us to remove next before dropping the locks. 821 * us to remove next before dropping the locks.
822 */ 822 */
823 __vma_unlink(mm, next, vma); 823 __vma_unlink(mm, next, vma);
824 if (file) 824 if (file)
825 __remove_shared_vm_struct(next, file, mapping); 825 __remove_shared_vm_struct(next, file, mapping);
826 } else if (insert) { 826 } else if (insert) {
827 /* 827 /*
828 * split_vma has split insert from vma, and needs 828 * split_vma has split insert from vma, and needs
829 * us to insert it before dropping the locks 829 * us to insert it before dropping the locks
830 * (it may either follow vma or precede it). 830 * (it may either follow vma or precede it).
831 */ 831 */
832 __insert_vm_struct(mm, insert); 832 __insert_vm_struct(mm, insert);
833 } else { 833 } else {
834 if (start_changed) 834 if (start_changed)
835 vma_gap_update(vma); 835 vma_gap_update(vma);
836 if (end_changed) { 836 if (end_changed) {
837 if (!next) 837 if (!next)
838 mm->highest_vm_end = end; 838 mm->highest_vm_end = end;
839 else if (!adjust_next) 839 else if (!adjust_next)
840 vma_gap_update(next); 840 vma_gap_update(next);
841 } 841 }
842 } 842 }
843 843
844 if (anon_vma) { 844 if (anon_vma) {
845 anon_vma_interval_tree_post_update_vma(vma); 845 anon_vma_interval_tree_post_update_vma(vma);
846 if (adjust_next) 846 if (adjust_next)
847 anon_vma_interval_tree_post_update_vma(next); 847 anon_vma_interval_tree_post_update_vma(next);
848 anon_vma_unlock_write(anon_vma); 848 anon_vma_unlock_write(anon_vma);
849 } 849 }
850 if (mapping) 850 if (mapping)
851 mutex_unlock(&mapping->i_mmap_mutex); 851 mutex_unlock(&mapping->i_mmap_mutex);
852 852
853 if (root) { 853 if (root) {
854 uprobe_mmap(vma); 854 uprobe_mmap(vma);
855 855
856 if (adjust_next) 856 if (adjust_next)
857 uprobe_mmap(next); 857 uprobe_mmap(next);
858 } 858 }
859 859
860 if (remove_next) { 860 if (remove_next) {
861 if (file) { 861 if (file) {
862 uprobe_munmap(next, next->vm_start, next->vm_end); 862 uprobe_munmap(next, next->vm_start, next->vm_end);
863 fput(file); 863 fput(file);
864 } 864 }
865 if (next->anon_vma) 865 if (next->anon_vma)
866 anon_vma_merge(vma, next); 866 anon_vma_merge(vma, next);
867 mm->map_count--; 867 mm->map_count--;
868 vma_set_policy(vma, vma_policy(next)); 868 vma_set_policy(vma, vma_policy(next));
869 kmem_cache_free(vm_area_cachep, next); 869 kmem_cache_free(vm_area_cachep, next);
870 /* 870 /*
871 * In mprotect's case 6 (see comments on vma_merge), 871 * In mprotect's case 6 (see comments on vma_merge),
872 * we must remove another next too. It would clutter 872 * we must remove another next too. It would clutter
873 * up the code too much to do both in one go. 873 * up the code too much to do both in one go.
874 */ 874 */
875 next = vma->vm_next; 875 next = vma->vm_next;
876 if (remove_next == 2) 876 if (remove_next == 2)
877 goto again; 877 goto again;
878 else if (next) 878 else if (next)
879 vma_gap_update(next); 879 vma_gap_update(next);
880 else 880 else
881 mm->highest_vm_end = end; 881 mm->highest_vm_end = end;
882 } 882 }
883 if (insert && file) 883 if (insert && file)
884 uprobe_mmap(insert); 884 uprobe_mmap(insert);
885 885
886 validate_mm(mm); 886 validate_mm(mm);
887 887
888 return 0; 888 return 0;
889 } 889 }
890 890
891 /* 891 /*
892 * If the vma has a ->close operation then the driver probably needs to release 892 * If the vma has a ->close operation then the driver probably needs to release
893 * per-vma resources, so we don't attempt to merge those. 893 * per-vma resources, so we don't attempt to merge those.
894 */ 894 */
895 static inline int is_mergeable_vma(struct vm_area_struct *vma, 895 static inline int is_mergeable_vma(struct vm_area_struct *vma,
896 struct file *file, unsigned long vm_flags) 896 struct file *file, unsigned long vm_flags)
897 { 897 {
898 if (vma->vm_flags ^ vm_flags) 898 if (vma->vm_flags ^ vm_flags)
899 return 0; 899 return 0;
900 if (vma->vm_file != file) 900 if (vma->vm_file != file)
901 return 0; 901 return 0;
902 if (vma->vm_ops && vma->vm_ops->close) 902 if (vma->vm_ops && vma->vm_ops->close)
903 return 0; 903 return 0;
904 return 1; 904 return 1;
905 } 905 }
906 906
907 static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, 907 static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
908 struct anon_vma *anon_vma2, 908 struct anon_vma *anon_vma2,
909 struct vm_area_struct *vma) 909 struct vm_area_struct *vma)
910 { 910 {
911 /* 911 /*
912 * The list_is_singular() test is to avoid merging VMA cloned from 912 * The list_is_singular() test is to avoid merging VMA cloned from
913 * parents. This can improve scalability by reducing anon_vma lock contention. 913 * parents. This can improve scalability by reducing anon_vma lock contention.
914 */ 914 */
915 if ((!anon_vma1 || !anon_vma2) && (!vma || 915 if ((!anon_vma1 || !anon_vma2) && (!vma ||
916 list_is_singular(&vma->anon_vma_chain))) 916 list_is_singular(&vma->anon_vma_chain)))
917 return 1; 917 return 1;
918 return anon_vma1 == anon_vma2; 918 return anon_vma1 == anon_vma2;
919 } 919 }
920 920
921 /* 921 /*
922 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) 922 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
923 * in front of (at a lower virtual address and file offset than) the vma. 923 * in front of (at a lower virtual address and file offset than) the vma.
924 * 924 *
925 * We cannot merge two vmas if they have differently assigned (non-NULL) 925 * We cannot merge two vmas if they have differently assigned (non-NULL)
926 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. 926 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
927 * 927 *
928 * We don't check here for the merged mmap wrapping around the end of pagecache 928 * We don't check here for the merged mmap wrapping around the end of pagecache
929 * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which 929 * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
930 * wrap, nor mmaps which cover the final page at index -1UL. 930 * wrap, nor mmaps which cover the final page at index -1UL.
931 */ 931 */
932 static int 932 static int
933 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, 933 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
934 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 934 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
935 { 935 {
936 if (is_mergeable_vma(vma, file, vm_flags) && 936 if (is_mergeable_vma(vma, file, vm_flags) &&
937 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 937 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
938 if (vma->vm_pgoff == vm_pgoff) 938 if (vma->vm_pgoff == vm_pgoff)
939 return 1; 939 return 1;
940 } 940 }
941 return 0; 941 return 0;
942 } 942 }
943 943
944 /* 944 /*
945 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) 945 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
946 * beyond (at a higher virtual address and file offset than) the vma. 946 * beyond (at a higher virtual address and file offset than) the vma.
947 * 947 *
948 * We cannot merge two vmas if they have differently assigned (non-NULL) 948 * We cannot merge two vmas if they have differently assigned (non-NULL)
949 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. 949 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
950 */ 950 */
951 static int 951 static int
952 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, 952 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
953 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 953 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
954 { 954 {
955 if (is_mergeable_vma(vma, file, vm_flags) && 955 if (is_mergeable_vma(vma, file, vm_flags) &&
956 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 956 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
957 pgoff_t vm_pglen; 957 pgoff_t vm_pglen;
958 vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; 958 vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
959 if (vma->vm_pgoff + vm_pglen == vm_pgoff) 959 if (vma->vm_pgoff + vm_pglen == vm_pgoff)
960 return 1; 960 return 1;
961 } 961 }
962 return 0; 962 return 0;
963 } 963 }
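Beyond identical flags and file, merging is gated by file-offset continuity: the earlier vma's vm_pgoff plus its length in pages must land exactly on the later mapping's vm_pgoff. The arithmetic, with made-up values and 4 KiB pages:

#include <stdio.h>

#define EXAMPLE_PAGE_SHIFT 12	/* assume 4 KiB pages */

int main(void)
{
	/* Existing vma: 8 pages of a file starting at page offset 10. */
	unsigned long vm_start = 0x700000, vm_end = 0x708000, vm_pgoff = 10;
	unsigned long vm_pglen = (vm_end - vm_start) >> EXAMPLE_PAGE_SHIFT;
	/* New request that starts where the vma ends, at file page offset 18. */
	unsigned long new_pgoff = 18;

	/* Same test as can_vma_merge_after(): virtual and file offsets both continue. */
	printf("mergeable: %s\n", vm_pgoff + vm_pglen == new_pgoff ? "yes" : "no");
	return 0;
}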
964 964
965 /* 965 /*
966 * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out 966 * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
967 * whether that can be merged with its predecessor or its successor. 967 * whether that can be merged with its predecessor or its successor.
968 * Or both (it neatly fills a hole). 968 * Or both (it neatly fills a hole).
969 * 969 *
970 * In most cases - when called for mmap, brk or mremap - [addr,end) is 970 * In most cases - when called for mmap, brk or mremap - [addr,end) is
971 * certain not to be mapped by the time vma_merge is called; but when 971 * certain not to be mapped by the time vma_merge is called; but when
972 * called for mprotect, it is certain to be already mapped (either at 972 * called for mprotect, it is certain to be already mapped (either at
973 * an offset within prev, or at the start of next), and the flags of 973 * an offset within prev, or at the start of next), and the flags of
974 * this area are about to be changed to vm_flags - and the no-change 974 * this area are about to be changed to vm_flags - and the no-change
975 * case has already been eliminated. 975 * case has already been eliminated.
976 * 976 *
977 * The following mprotect cases have to be considered, where AAAA is 977 * The following mprotect cases have to be considered, where AAAA is
978 * the area passed down from mprotect_fixup, never extending beyond one 978 * the area passed down from mprotect_fixup, never extending beyond one
979 * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after: 979 * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
980 * 980 *
981 * AAAA AAAA AAAA AAAA 981 * AAAA AAAA AAAA AAAA
982 * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX 982 * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX
983 * cannot merge might become might become might become 983 * cannot merge might become might become might become
984 * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or 984 * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or
985 * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or 985 * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or
986 * mremap move: PPPPNNNNNNNN 8 986 * mremap move: PPPPNNNNNNNN 8
987 * AAAA 987 * AAAA
988 * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN 988 * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN
989 * might become case 1 below case 2 below case 3 below 989 * might become case 1 below case 2 below case 3 below
990 * 990 *
991 * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX: 991 * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
992 * mprotect_fixup updates vm_flags & vm_page_prot on successful return. 992 * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
993 */ 993 */
994 struct vm_area_struct *vma_merge(struct mm_struct *mm, 994 struct vm_area_struct *vma_merge(struct mm_struct *mm,
995 struct vm_area_struct *prev, unsigned long addr, 995 struct vm_area_struct *prev, unsigned long addr,
996 unsigned long end, unsigned long vm_flags, 996 unsigned long end, unsigned long vm_flags,
997 struct anon_vma *anon_vma, struct file *file, 997 struct anon_vma *anon_vma, struct file *file,
998 pgoff_t pgoff, struct mempolicy *policy) 998 pgoff_t pgoff, struct mempolicy *policy)
999 { 999 {
1000 pgoff_t pglen = (end - addr) >> PAGE_SHIFT; 1000 pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
1001 struct vm_area_struct *area, *next; 1001 struct vm_area_struct *area, *next;
1002 int err; 1002 int err;
1003 1003
1004 /* 1004 /*
1005 * We later require that vma->vm_flags == vm_flags, 1005 * We later require that vma->vm_flags == vm_flags,
1006 * so this tests vma->vm_flags & VM_SPECIAL, too. 1006 * so this tests vma->vm_flags & VM_SPECIAL, too.
1007 */ 1007 */
1008 if (vm_flags & VM_SPECIAL) 1008 if (vm_flags & VM_SPECIAL)
1009 return NULL; 1009 return NULL;
1010 1010
1011 if (prev) 1011 if (prev)
1012 next = prev->vm_next; 1012 next = prev->vm_next;
1013 else 1013 else
1014 next = mm->mmap; 1014 next = mm->mmap;
1015 area = next; 1015 area = next;
1016 if (next && next->vm_end == end) /* cases 6, 7, 8 */ 1016 if (next && next->vm_end == end) /* cases 6, 7, 8 */
1017 next = next->vm_next; 1017 next = next->vm_next;
1018 1018
1019 /* 1019 /*
1020 * Can it merge with the predecessor? 1020 * Can it merge with the predecessor?
1021 */ 1021 */
1022 if (prev && prev->vm_end == addr && 1022 if (prev && prev->vm_end == addr &&
1023 mpol_equal(vma_policy(prev), policy) && 1023 mpol_equal(vma_policy(prev), policy) &&
1024 can_vma_merge_after(prev, vm_flags, 1024 can_vma_merge_after(prev, vm_flags,
1025 anon_vma, file, pgoff)) { 1025 anon_vma, file, pgoff)) {
1026 /* 1026 /*
1027 * OK, it can. Can we now merge in the successor as well? 1027 * OK, it can. Can we now merge in the successor as well?
1028 */ 1028 */
1029 if (next && end == next->vm_start && 1029 if (next && end == next->vm_start &&
1030 mpol_equal(policy, vma_policy(next)) && 1030 mpol_equal(policy, vma_policy(next)) &&
1031 can_vma_merge_before(next, vm_flags, 1031 can_vma_merge_before(next, vm_flags,
1032 anon_vma, file, pgoff+pglen) && 1032 anon_vma, file, pgoff+pglen) &&
1033 is_mergeable_anon_vma(prev->anon_vma, 1033 is_mergeable_anon_vma(prev->anon_vma,
1034 next->anon_vma, NULL)) { 1034 next->anon_vma, NULL)) {
1035 /* cases 1, 6 */ 1035 /* cases 1, 6 */
1036 err = vma_adjust(prev, prev->vm_start, 1036 err = vma_adjust(prev, prev->vm_start,
1037 next->vm_end, prev->vm_pgoff, NULL); 1037 next->vm_end, prev->vm_pgoff, NULL);
1038 } else /* cases 2, 5, 7 */ 1038 } else /* cases 2, 5, 7 */
1039 err = vma_adjust(prev, prev->vm_start, 1039 err = vma_adjust(prev, prev->vm_start,
1040 end, prev->vm_pgoff, NULL); 1040 end, prev->vm_pgoff, NULL);
1041 if (err) 1041 if (err)
1042 return NULL; 1042 return NULL;
1043 khugepaged_enter_vma_merge(prev); 1043 khugepaged_enter_vma_merge(prev);
1044 return prev; 1044 return prev;
1045 } 1045 }
1046 1046
1047 /* 1047 /*
1048 * Can this new request be merged in front of next? 1048 * Can this new request be merged in front of next?
1049 */ 1049 */
1050 if (next && end == next->vm_start && 1050 if (next && end == next->vm_start &&
1051 mpol_equal(policy, vma_policy(next)) && 1051 mpol_equal(policy, vma_policy(next)) &&
1052 can_vma_merge_before(next, vm_flags, 1052 can_vma_merge_before(next, vm_flags,
1053 anon_vma, file, pgoff+pglen)) { 1053 anon_vma, file, pgoff+pglen)) {
1054 if (prev && addr < prev->vm_end) /* case 4 */ 1054 if (prev && addr < prev->vm_end) /* case 4 */
1055 err = vma_adjust(prev, prev->vm_start, 1055 err = vma_adjust(prev, prev->vm_start,
1056 addr, prev->vm_pgoff, NULL); 1056 addr, prev->vm_pgoff, NULL);
1057 else /* cases 3, 8 */ 1057 else /* cases 3, 8 */
1058 err = vma_adjust(area, addr, next->vm_end, 1058 err = vma_adjust(area, addr, next->vm_end,
1059 next->vm_pgoff - pglen, NULL); 1059 next->vm_pgoff - pglen, NULL);
1060 if (err) 1060 if (err)
1061 return NULL; 1061 return NULL;
1062 khugepaged_enter_vma_merge(area); 1062 khugepaged_enter_vma_merge(area);
1063 return area; 1063 return area;
1064 } 1064 }
1065 1065
1066 return NULL; 1066 return NULL;
1067 } 1067 }
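The merge logic is observable from user space: two adjacent anonymous mappings created with identical protections and flags normally end up as a single entry in /proc/self/maps. A hedged demonstration (it reserves a window first so MAP_FIXED cannot clobber unrelated mappings; whether the kernel merges is an implementation detail, not a guarantee):

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 1 << 20;
	char *a;

	/* Reserve a contiguous 2 MiB window to carve up safely. */
	a = mmap(NULL, 2 * len, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (a == MAP_FAILED)
		return 1;

	/* Replace the window with two adjacent rw mappings, created separately. */
	mmap(a, len, PROT_READ | PROT_WRITE,
	     MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
	mmap(a + len, len, PROT_READ | PROT_WRITE,
	     MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);

	/* vma_merge() usually folds them: expect one rw-p line covering [a, a+2MiB). */
	printf("look for %p-%p below:\n", (void *)a, (void *)(a + 2 * len));
	system("cat /proc/self/maps");
	return 0;
}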
1068 1068
1069 /* 1069 /*
1070 * Rough compatibility check to quickly see if it's even worth looking 1070 * Rough compatibility check to quickly see if it's even worth looking
1071 * at sharing an anon_vma. 1071 * at sharing an anon_vma.
1072 * 1072 *
1073 * They need to have the same vm_file, and the flags can only differ 1073 * They need to have the same vm_file, and the flags can only differ
1074 * in things that mprotect may change. 1074 * in things that mprotect may change.
1075 * 1075 *
1076 * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that 1076 * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
1077 * we can merge the two vma's. For example, we refuse to merge a vma if 1077 * we can merge the two vma's. For example, we refuse to merge a vma if
1078 * there is a vm_ops->close() function, because that indicates that the 1078 * there is a vm_ops->close() function, because that indicates that the
1079 * driver is doing some kind of reference counting. But that doesn't 1079 * driver is doing some kind of reference counting. But that doesn't
1080 * really matter for the anon_vma sharing case. 1080 * really matter for the anon_vma sharing case.
1081 */ 1081 */
1082 static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) 1082 static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
1083 { 1083 {
1084 return a->vm_end == b->vm_start && 1084 return a->vm_end == b->vm_start &&
1085 mpol_equal(vma_policy(a), vma_policy(b)) && 1085 mpol_equal(vma_policy(a), vma_policy(b)) &&
1086 a->vm_file == b->vm_file && 1086 a->vm_file == b->vm_file &&
1087 !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) && 1087 !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) &&
1088 b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); 1088 b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
1089 } 1089 }
1090 1090
1091 /* 1091 /*
1092 * Do some basic sanity checking to see if we can re-use the anon_vma 1092 * Do some basic sanity checking to see if we can re-use the anon_vma
1093 * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be 1093 * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
1094 * the same as 'old', the other will be the new one that is trying 1094 * the same as 'old', the other will be the new one that is trying
1095 * to share the anon_vma. 1095 * to share the anon_vma.
1096 * 1096 *
1097 * NOTE! This runs with mm_sem held for reading, so it is possible that 1097 * NOTE! This runs with mm_sem held for reading, so it is possible that
1098 * the anon_vma of 'old' is concurrently in the process of being set up 1098 * the anon_vma of 'old' is concurrently in the process of being set up
1099 * by another page fault trying to merge _that_. But that's ok: if it 1099 * by another page fault trying to merge _that_. But that's ok: if it
1100 * is being set up, that automatically means that it will be a singleton 1100 * is being set up, that automatically means that it will be a singleton
1101 * acceptable for merging, so we can do all of this optimistically. But 1101 * acceptable for merging, so we can do all of this optimistically. But
1102 * we do that ACCESS_ONCE() to make sure that we never re-load the pointer. 1102 * we do that ACCESS_ONCE() to make sure that we never re-load the pointer.
1103 * 1103 *
1104 * IOW: that the "list_is_singular()" test on the anon_vma_chain only 1104 * IOW: that the "list_is_singular()" test on the anon_vma_chain only
1105 * matters for the 'stable anon_vma' case (ie the thing we want to avoid 1105 * matters for the 'stable anon_vma' case (ie the thing we want to avoid
1106 * is to return an anon_vma that is "complex" due to having gone through 1106 * is to return an anon_vma that is "complex" due to having gone through
1107 * a fork). 1107 * a fork).
1108 * 1108 *
1109 * We also make sure that the two vma's are compatible (adjacent, 1109 * We also make sure that the two vma's are compatible (adjacent,
1110 * and with the same memory policies). That's all stable, even with just 1110 * and with the same memory policies). That's all stable, even with just
1111 * a read lock on the mm_sem. 1111 * a read lock on the mm_sem.
1112 */ 1112 */
1113 static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) 1113 static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
1114 { 1114 {
1115 if (anon_vma_compatible(a, b)) { 1115 if (anon_vma_compatible(a, b)) {
1116 struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma); 1116 struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma);
1117 1117
1118 if (anon_vma && list_is_singular(&old->anon_vma_chain)) 1118 if (anon_vma && list_is_singular(&old->anon_vma_chain))
1119 return anon_vma; 1119 return anon_vma;
1120 } 1120 }
1121 return NULL; 1121 return NULL;
1122 } 1122 }
1123 1123
1124 /* 1124 /*
1125 * find_mergeable_anon_vma is used by anon_vma_prepare, to check 1125 * find_mergeable_anon_vma is used by anon_vma_prepare, to check
1126 * neighbouring vmas for a suitable anon_vma, before it goes off 1126 * neighbouring vmas for a suitable anon_vma, before it goes off
1127 * to allocate a new anon_vma. It checks because a repetitive 1127 * to allocate a new anon_vma. It checks because a repetitive
1128 * sequence of mprotects and faults may otherwise lead to distinct 1128 * sequence of mprotects and faults may otherwise lead to distinct
1129 * anon_vmas being allocated, preventing vma merge in subsequent 1129 * anon_vmas being allocated, preventing vma merge in subsequent
1130 * mprotect. 1130 * mprotect.
1131 */ 1131 */
1132 struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) 1132 struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
1133 { 1133 {
1134 struct anon_vma *anon_vma; 1134 struct anon_vma *anon_vma;
1135 struct vm_area_struct *near; 1135 struct vm_area_struct *near;
1136 1136
1137 near = vma->vm_next; 1137 near = vma->vm_next;
1138 if (!near) 1138 if (!near)
1139 goto try_prev; 1139 goto try_prev;
1140 1140
1141 anon_vma = reusable_anon_vma(near, vma, near); 1141 anon_vma = reusable_anon_vma(near, vma, near);
1142 if (anon_vma) 1142 if (anon_vma)
1143 return anon_vma; 1143 return anon_vma;
1144 try_prev: 1144 try_prev:
1145 near = vma->vm_prev; 1145 near = vma->vm_prev;
1146 if (!near) 1146 if (!near)
1147 goto none; 1147 goto none;
1148 1148
1149 anon_vma = reusable_anon_vma(near, near, vma); 1149 anon_vma = reusable_anon_vma(near, near, vma);
1150 if (anon_vma) 1150 if (anon_vma)
1151 return anon_vma; 1151 return anon_vma;
1152 none: 1152 none:
1153 /* 1153 /*
1154 * There's no absolute need to look only at touching neighbours: 1154 * There's no absolute need to look only at touching neighbours:
1155 * we could search further afield for "compatible" anon_vmas. 1155 * we could search further afield for "compatible" anon_vmas.
1156 * But it would probably just be a waste of time searching, 1156 * But it would probably just be a waste of time searching,
1157 * or lead to too many vmas hanging off the same anon_vma. 1157 * or lead to too many vmas hanging off the same anon_vma.
1158 * We're trying to allow mprotect remerging later on, 1158 * We're trying to allow mprotect remerging later on,
1159 * not trying to minimize memory used for anon_vmas. 1159 * not trying to minimize memory used for anon_vmas.
1160 */ 1160 */
1161 return NULL; 1161 return NULL;
1162 } 1162 }
1163 1163
1164 #ifdef CONFIG_PROC_FS 1164 #ifdef CONFIG_PROC_FS
1165 void vm_stat_account(struct mm_struct *mm, unsigned long flags, 1165 void vm_stat_account(struct mm_struct *mm, unsigned long flags,
1166 struct file *file, long pages) 1166 struct file *file, long pages)
1167 { 1167 {
1168 const unsigned long stack_flags 1168 const unsigned long stack_flags
1169 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); 1169 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
1170 1170
1171 mm->total_vm += pages; 1171 mm->total_vm += pages;
1172 1172
1173 if (file) { 1173 if (file) {
1174 mm->shared_vm += pages; 1174 mm->shared_vm += pages;
1175 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) 1175 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
1176 mm->exec_vm += pages; 1176 mm->exec_vm += pages;
1177 } else if (flags & stack_flags) 1177 } else if (flags & stack_flags)
1178 mm->stack_vm += pages; 1178 mm->stack_vm += pages;
1179 } 1179 }
1180 #endif /* CONFIG_PROC_FS */ 1180 #endif /* CONFIG_PROC_FS */
1181 1181
1182 /* 1182 /*
1183 * If a hint addr is less than mmap_min_addr change hint to be as 1183 * If a hint addr is less than mmap_min_addr change hint to be as
1184 * low as possible but still greater than mmap_min_addr 1184 * low as possible but still greater than mmap_min_addr
1185 */ 1185 */
1186 static inline unsigned long round_hint_to_min(unsigned long hint) 1186 static inline unsigned long round_hint_to_min(unsigned long hint)
1187 { 1187 {
1188 hint &= PAGE_MASK; 1188 hint &= PAGE_MASK;
1189 if (((void *)hint != NULL) && 1189 if (((void *)hint != NULL) &&
1190 (hint < mmap_min_addr)) 1190 (hint < mmap_min_addr))
1191 return PAGE_ALIGN(mmap_min_addr); 1191 return PAGE_ALIGN(mmap_min_addr);
1192 return hint; 1192 return hint;
1193 } 1193 }
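Concretely: with a vm.mmap_min_addr setting of 65536, a non-NULL hint of, say, 0x1000 is bumped to PAGE_ALIGN(65536) = 0x10000, while a NULL hint (or any hint at or above the threshold) passes through untouched so get_unmapped_area() can choose freely. The exact threshold depends on the sysctl and LSM configuration.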
1194 1194
1195 /* 1195 /*
1196 * The caller must hold down_write(&current->mm->mmap_sem). 1196 * The caller must hold down_write(&current->mm->mmap_sem).
1197 */ 1197 */
1198 1198
1199 unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, 1199 unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1200 unsigned long len, unsigned long prot, 1200 unsigned long len, unsigned long prot,
1201 unsigned long flags, unsigned long pgoff, 1201 unsigned long flags, unsigned long pgoff,
1202 unsigned long *populate) 1202 unsigned long *populate)
1203 { 1203 {
1204 struct mm_struct * mm = current->mm; 1204 struct mm_struct * mm = current->mm;
1205 struct inode *inode; 1205 struct inode *inode;
1206 vm_flags_t vm_flags; 1206 vm_flags_t vm_flags;
1207 1207
1208 *populate = 0; 1208 *populate = 0;
1209 1209
1210 /* 1210 /*
1211 * Does the application expect PROT_READ to imply PROT_EXEC? 1211 * Does the application expect PROT_READ to imply PROT_EXEC?
1212 * 1212 *
1213 * (the exception is when the underlying filesystem is noexec 1213 * (the exception is when the underlying filesystem is noexec
1214 * mounted, in which case we don't add PROT_EXEC.) 1214 * mounted, in which case we don't add PROT_EXEC.)
1215 */ 1215 */
1216 if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) 1216 if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
1217 if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) 1217 if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
1218 prot |= PROT_EXEC; 1218 prot |= PROT_EXEC;
1219 1219
1220 if (!len) 1220 if (!len)
1221 return -EINVAL; 1221 return -EINVAL;
1222 1222
1223 if (!(flags & MAP_FIXED)) 1223 if (!(flags & MAP_FIXED))
1224 addr = round_hint_to_min(addr); 1224 addr = round_hint_to_min(addr);
1225 1225
1226 /* Careful about overflows.. */ 1226 /* Careful about overflows.. */
1227 len = PAGE_ALIGN(len); 1227 len = PAGE_ALIGN(len);
1228 if (!len) 1228 if (!len)
1229 return -ENOMEM; 1229 return -ENOMEM;
1230 1230
1231 /* offset overflow? */ 1231 /* offset overflow? */
1232 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) 1232 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
1233 return -EOVERFLOW; 1233 return -EOVERFLOW;
1234 1234
1235 /* Too many mappings? */ 1235 /* Too many mappings? */
1236 if (mm->map_count > sysctl_max_map_count) 1236 if (mm->map_count > sysctl_max_map_count)
1237 return -ENOMEM; 1237 return -ENOMEM;
1238 1238
1239 /* Obtain the address to map to. We verify (or select) it and ensure 1239 /* Obtain the address to map to. We verify (or select) it and ensure
1240 * that it represents a valid section of the address space. 1240 * that it represents a valid section of the address space.
1241 */ 1241 */
1242 addr = get_unmapped_area(file, addr, len, pgoff, flags); 1242 addr = get_unmapped_area(file, addr, len, pgoff, flags);
1243 if (addr & ~PAGE_MASK) 1243 if (addr & ~PAGE_MASK)
1244 return addr; 1244 return addr;
1245 1245
1246 /* Do simple checking here so the lower-level routines won't have 1246 /* Do simple checking here so the lower-level routines won't have
1247 * to. We assume access permissions have been handled by the open 1247 * to. We assume access permissions have been handled by the open
1248 * of the memory object, so we don't do any here. 1248 * of the memory object, so we don't do any here.
1249 */ 1249 */
1250 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | 1250 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
1251 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; 1251 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1252 1252
1253 if (flags & MAP_LOCKED) 1253 if (flags & MAP_LOCKED)
1254 if (!can_do_mlock()) 1254 if (!can_do_mlock())
1255 return -EPERM; 1255 return -EPERM;
1256 1256
1257 /* mlock MCL_FUTURE? */ 1257 /* mlock MCL_FUTURE? */
1258 if (vm_flags & VM_LOCKED) { 1258 if (vm_flags & VM_LOCKED) {
1259 unsigned long locked, lock_limit; 1259 unsigned long locked, lock_limit;
1260 locked = len >> PAGE_SHIFT; 1260 locked = len >> PAGE_SHIFT;
1261 locked += mm->locked_vm; 1261 locked += mm->locked_vm;
1262 lock_limit = rlimit(RLIMIT_MEMLOCK); 1262 lock_limit = rlimit(RLIMIT_MEMLOCK);
1263 lock_limit >>= PAGE_SHIFT; 1263 lock_limit >>= PAGE_SHIFT;
1264 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 1264 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
1265 return -EAGAIN; 1265 return -EAGAIN;
1266 } 1266 }
1267 1267
1268 inode = file ? file_inode(file) : NULL; 1268 inode = file ? file_inode(file) : NULL;
1269 1269
1270 if (file) { 1270 if (file) {
1271 switch (flags & MAP_TYPE) { 1271 switch (flags & MAP_TYPE) {
1272 case MAP_SHARED: 1272 case MAP_SHARED:
1273 if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) 1273 if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
1274 return -EACCES; 1274 return -EACCES;
1275 1275
1276 /* 1276 /*
1277 * Make sure we don't allow writing to an append-only 1277 * Make sure we don't allow writing to an append-only
1278 * file.. 1278 * file..
1279 */ 1279 */
1280 if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) 1280 if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
1281 return -EACCES; 1281 return -EACCES;
1282 1282
1283 /* 1283 /*
1284 * Make sure there are no mandatory locks on the file. 1284 * Make sure there are no mandatory locks on the file.
1285 */ 1285 */
1286 if (locks_verify_locked(inode)) 1286 if (locks_verify_locked(inode))
1287 return -EAGAIN; 1287 return -EAGAIN;
1288 1288
1289 vm_flags |= VM_SHARED | VM_MAYSHARE; 1289 vm_flags |= VM_SHARED | VM_MAYSHARE;
1290 if (!(file->f_mode & FMODE_WRITE)) 1290 if (!(file->f_mode & FMODE_WRITE))
1291 vm_flags &= ~(VM_MAYWRITE | VM_SHARED); 1291 vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
1292 1292
1293 /* fall through */ 1293 /* fall through */
1294 case MAP_PRIVATE: 1294 case MAP_PRIVATE:
1295 if (!(file->f_mode & FMODE_READ)) 1295 if (!(file->f_mode & FMODE_READ))
1296 return -EACCES; 1296 return -EACCES;
1297 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { 1297 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
1298 if (vm_flags & VM_EXEC) 1298 if (vm_flags & VM_EXEC)
1299 return -EPERM; 1299 return -EPERM;
1300 vm_flags &= ~VM_MAYEXEC; 1300 vm_flags &= ~VM_MAYEXEC;
1301 } 1301 }
1302 1302
1303 if (!file->f_op || !file->f_op->mmap) 1303 if (!file->f_op || !file->f_op->mmap)
1304 return -ENODEV; 1304 return -ENODEV;
1305 break; 1305 break;
1306 1306
1307 default: 1307 default:
1308 return -EINVAL; 1308 return -EINVAL;
1309 } 1309 }
1310 } else { 1310 } else {
1311 switch (flags & MAP_TYPE) { 1311 switch (flags & MAP_TYPE) {
1312 case MAP_SHARED: 1312 case MAP_SHARED:
1313 /* 1313 /*
1314 * Ignore pgoff. 1314 * Ignore pgoff.
1315 */ 1315 */
1316 pgoff = 0; 1316 pgoff = 0;
1317 vm_flags |= VM_SHARED | VM_MAYSHARE; 1317 vm_flags |= VM_SHARED | VM_MAYSHARE;
1318 break; 1318 break;
1319 case MAP_PRIVATE: 1319 case MAP_PRIVATE:
1320 /* 1320 /*
1321 * Set pgoff according to addr for anon_vma. 1321 * Set pgoff according to addr for anon_vma.
1322 */ 1322 */
1323 pgoff = addr >> PAGE_SHIFT; 1323 pgoff = addr >> PAGE_SHIFT;
1324 break; 1324 break;
1325 default: 1325 default:
1326 return -EINVAL; 1326 return -EINVAL;
1327 } 1327 }
1328 } 1328 }
1329 1329
1330 /* 1330 /*
1331 * Set 'VM_NORESERVE' if we should not account for the 1331 * Set 'VM_NORESERVE' if we should not account for the
1332 * memory use of this mapping. 1332 * memory use of this mapping.
1333 */ 1333 */
1334 if (flags & MAP_NORESERVE) { 1334 if (flags & MAP_NORESERVE) {
1335 /* We honor MAP_NORESERVE if allowed to overcommit */ 1335 /* We honor MAP_NORESERVE if allowed to overcommit */
1336 if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) 1336 if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1337 vm_flags |= VM_NORESERVE; 1337 vm_flags |= VM_NORESERVE;
1338 1338
1339 /* hugetlb applies strict overcommit unless MAP_NORESERVE */ 1339 /* hugetlb applies strict overcommit unless MAP_NORESERVE */
1340 if (file && is_file_hugepages(file)) 1340 if (file && is_file_hugepages(file))
1341 vm_flags |= VM_NORESERVE; 1341 vm_flags |= VM_NORESERVE;
1342 } 1342 }
1343 1343
1344 addr = mmap_region(file, addr, len, vm_flags, pgoff); 1344 addr = mmap_region(file, addr, len, vm_flags, pgoff);
1345 if (!IS_ERR_VALUE(addr) && 1345 if (!IS_ERR_VALUE(addr) &&
1346 ((vm_flags & VM_LOCKED) || 1346 ((vm_flags & VM_LOCKED) ||
1347 (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) 1347 (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
1348 *populate = len; 1348 *populate = len;
1349 return addr; 1349 return addr;
1350 } 1350 }
1351 1351
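The VM_LOCKED branch above is the same arithmetic userspace hits when mlock limits bite: locked pages (existing plus requested) are compared against RLIMIT_MEMLOCK in pages, with CAP_IPC_LOCK as the escape hatch. The sketch below is not kernel code; would_exceed_memlock() is a made-up helper and the page size is taken from sysconf() rather than PAGE_SHIFT.

#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

/* would_exceed_memlock() is a hypothetical helper mirroring the check in
 * do_mmap_pgoff(): existing locked pages plus the new request are compared
 * with RLIMIT_MEMLOCK expressed in pages. */
static int would_exceed_memlock(size_t request_len, size_t already_locked)
{
        struct rlimit rl;
        long psize = sysconf(_SC_PAGESIZE);
        size_t locked_pages;

        if (getrlimit(RLIMIT_MEMLOCK, &rl))
                return -1;
        locked_pages = (request_len + psize - 1) / psize +
                       already_locked / psize;
        return locked_pages > rl.rlim_cur / (unsigned long)psize;
}

int main(void)
{
        printf("64 MB MAP_LOCKED request over the limit? %d\n",
               would_exceed_memlock(64UL << 20, 0));
        return 0;
}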
1352 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, 1352 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1353 unsigned long, prot, unsigned long, flags, 1353 unsigned long, prot, unsigned long, flags,
1354 unsigned long, fd, unsigned long, pgoff) 1354 unsigned long, fd, unsigned long, pgoff)
1355 { 1355 {
1356 struct file *file = NULL; 1356 struct file *file = NULL;
1357 unsigned long retval = -EBADF; 1357 unsigned long retval = -EBADF;
1358 1358
1359 if (!(flags & MAP_ANONYMOUS)) { 1359 if (!(flags & MAP_ANONYMOUS)) {
1360 audit_mmap_fd(fd, flags); 1360 audit_mmap_fd(fd, flags);
1361 if (unlikely(flags & MAP_HUGETLB)) 1361 if (unlikely(flags & MAP_HUGETLB))
1362 return -EINVAL; 1362 return -EINVAL;
1363 file = fget(fd); 1363 file = fget(fd);
1364 if (!file) 1364 if (!file)
1365 goto out; 1365 goto out;
1366 if (is_file_hugepages(file))
1367 len = ALIGN(len, huge_page_size(hstate_file(file)));
1366 } else if (flags & MAP_HUGETLB) { 1368 } else if (flags & MAP_HUGETLB) {
1367 struct user_struct *user = NULL; 1369 struct user_struct *user = NULL;
1370
1371 len = ALIGN(len, huge_page_size(hstate_sizelog(
1372 (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK)));
1368 /* 1373 /*
1369 * VM_NORESERVE is used because the reservations will be 1374 * VM_NORESERVE is used because the reservations will be
1370 * taken when vm_ops->mmap() is called 1375 * taken when vm_ops->mmap() is called
1371 * A dummy user value is used because we are not locking 1376 * A dummy user value is used because we are not locking
1372 * memory so no accounting is necessary 1377 * memory so no accounting is necessary
1373 */ 1378 */
1374 file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, 1379 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
1375 VM_NORESERVE, 1380 VM_NORESERVE,
1376 &user, HUGETLB_ANONHUGE_INODE, 1381 &user, HUGETLB_ANONHUGE_INODE,
1377 (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); 1382 (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
1378 if (IS_ERR(file)) 1383 if (IS_ERR(file))
1379 return PTR_ERR(file); 1384 return PTR_ERR(file);
1380 } 1385 }
1381 1386
1382 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); 1387 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1383 1388
1384 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); 1389 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1385 if (file) 1390 if (file)
1386 fput(file); 1391 fput(file);
1387 out: 1392 out:
1388 return retval; 1393 return retval;
1389 } 1394 }
1390 1395
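The two hunks above are the heart of this commit for sys_mmap_pgoff(): len is rounded up to the huge page size of the backing file (or of the size requested via MAP_HUGE_SHIFT) before it reaches vm_mmap_pgoff(). As an illustration only, the userspace sketch below requests an anonymous MAP_HUGETLB mapping whose length is deliberately not hugepage aligned; per the commit message this used to fail with -EINVAL and now succeeds. A 2 MB default huge page size and a preconfigured huge page pool are assumptions, not guarantees of the diff.

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        /* Deliberately NOT a multiple of the assumed 2 MB huge page size. */
        size_t len = (2UL << 20) + 4096;
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

        if (p == MAP_FAILED) {
                /* Before this fix the kernel returned EINVAL here. */
                printf("mmap: %s\n", strerror(errno));
                return 1;
        }
        printf("mapped %zu bytes at %p\n", len, p);

        /* The kernel rounded the mapping up to whole huge pages, so unmap
         * with the rounded length (2 MB huge page size assumed). */
        munmap(p, (len + (2UL << 20) - 1) & ~((2UL << 20) - 1));
        return 0;
}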
1391 #ifdef __ARCH_WANT_SYS_OLD_MMAP 1396 #ifdef __ARCH_WANT_SYS_OLD_MMAP
1392 struct mmap_arg_struct { 1397 struct mmap_arg_struct {
1393 unsigned long addr; 1398 unsigned long addr;
1394 unsigned long len; 1399 unsigned long len;
1395 unsigned long prot; 1400 unsigned long prot;
1396 unsigned long flags; 1401 unsigned long flags;
1397 unsigned long fd; 1402 unsigned long fd;
1398 unsigned long offset; 1403 unsigned long offset;
1399 }; 1404 };
1400 1405
1401 SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) 1406 SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1402 { 1407 {
1403 struct mmap_arg_struct a; 1408 struct mmap_arg_struct a;
1404 1409
1405 if (copy_from_user(&a, arg, sizeof(a))) 1410 if (copy_from_user(&a, arg, sizeof(a)))
1406 return -EFAULT; 1411 return -EFAULT;
1407 if (a.offset & ~PAGE_MASK) 1412 if (a.offset & ~PAGE_MASK)
1408 return -EINVAL; 1413 return -EINVAL;
1409 1414
1410 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, 1415 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
1411 a.offset >> PAGE_SHIFT); 1416 a.offset >> PAGE_SHIFT);
1412 } 1417 }
1413 #endif /* __ARCH_WANT_SYS_OLD_MMAP */ 1418 #endif /* __ARCH_WANT_SYS_OLD_MMAP */
1414 1419
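For reference, the legacy old_mmap() entry point above takes a byte offset, rejects anything that is not page aligned, and converts it to the page offset that sys_mmap_pgoff() expects. A tiny standalone sketch of that conversion, with invented values and a hard-coded 4 KB page size:

#include <stdio.h>

int main(void)
{
        unsigned long page_shift = 12;
        unsigned long page_mask  = ~((1UL << page_shift) - 1);
        unsigned long offset     = 3UL << page_shift;   /* 12 KB, aligned */

        if (offset & ~page_mask)
                printf("old_mmap would return -EINVAL\n");
        else
                printf("pgoff passed on = %lu\n", offset >> page_shift);
        return 0;
}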
1415 /* 1420 /*
1416 * Some shared mappings will want the pages marked read-only 1421 * Some shared mappings will want the pages marked read-only
1417 * to track write events. If so, we'll downgrade vm_page_prot 1422 * to track write events. If so, we'll downgrade vm_page_prot
1418 * to the private version (using protection_map[] without the 1423 * to the private version (using protection_map[] without the
1419 * VM_SHARED bit). 1424 * VM_SHARED bit).
1420 */ 1425 */
1421 int vma_wants_writenotify(struct vm_area_struct *vma) 1426 int vma_wants_writenotify(struct vm_area_struct *vma)
1422 { 1427 {
1423 vm_flags_t vm_flags = vma->vm_flags; 1428 vm_flags_t vm_flags = vma->vm_flags;
1424 1429
1425 /* If it was private or non-writable, the write bit is already clear */ 1430 /* If it was private or non-writable, the write bit is already clear */
1426 if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) 1431 if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
1427 return 0; 1432 return 0;
1428 1433
1429 /* The backer wishes to know when pages are first written to? */ 1434 /* The backer wishes to know when pages are first written to? */
1430 if (vma->vm_ops && vma->vm_ops->page_mkwrite) 1435 if (vma->vm_ops && vma->vm_ops->page_mkwrite)
1431 return 1; 1436 return 1;
1432 1437
1433 /* The open routine did something to the protections already? */ 1438 /* The open routine did something to the protections already? */
1434 if (pgprot_val(vma->vm_page_prot) != 1439 if (pgprot_val(vma->vm_page_prot) !=
1435 pgprot_val(vm_get_page_prot(vm_flags))) 1440 pgprot_val(vm_get_page_prot(vm_flags)))
1436 return 0; 1441 return 0;
1437 1442
1438 /* Specialty mapping? */ 1443 /* Specialty mapping? */
1439 if (vm_flags & VM_PFNMAP) 1444 if (vm_flags & VM_PFNMAP)
1440 return 0; 1445 return 0;
1441 1446
1442 /* Can the mapping track the dirty pages? */ 1447 /* Can the mapping track the dirty pages? */
1443 return vma->vm_file && vma->vm_file->f_mapping && 1448 return vma->vm_file && vma->vm_file->f_mapping &&
1444 mapping_cap_account_dirty(vma->vm_file->f_mapping); 1449 mapping_cap_account_dirty(vma->vm_file->f_mapping);
1445 } 1450 }
1446 1451
1447 /* 1452 /*
1448 * We account for memory if it's a private writeable mapping, 1453 * We account for memory if it's a private writeable mapping,
1449 * not hugepages and VM_NORESERVE wasn't set. 1454 * not hugepages and VM_NORESERVE wasn't set.
1450 */ 1455 */
1451 static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) 1456 static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
1452 { 1457 {
1453 /* 1458 /*
1454 * hugetlb has its own accounting separate from the core VM 1459 * hugetlb has its own accounting separate from the core VM
1455 * VM_HUGETLB may not be set yet so we cannot check for that flag. 1460 * VM_HUGETLB may not be set yet so we cannot check for that flag.
1456 */ 1461 */
1457 if (file && is_file_hugepages(file)) 1462 if (file && is_file_hugepages(file))
1458 return 0; 1463 return 0;
1459 1464
1460 return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; 1465 return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
1461 } 1466 }
1462 1467
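The predicate in accountable_mapping() is a single mask test: only a private, writable mapping without VM_NORESERVE gets charged. The standalone sketch below replays that test with local stand-in flag bits (EX_VM_*), which are illustrative values and not the kernel's definitions.

#include <stdio.h>

/* Local stand-in flag bits; the real values live in the kernel headers. */
#define EX_VM_WRITE     0x1UL
#define EX_VM_SHARED    0x2UL
#define EX_VM_NORESERVE 0x4UL

static int ex_accountable(unsigned long vm_flags)
{
        return (vm_flags & (EX_VM_NORESERVE | EX_VM_SHARED | EX_VM_WRITE))
                == EX_VM_WRITE;
}

int main(void)
{
        printf("private+write           -> %d\n", ex_accountable(EX_VM_WRITE));
        printf("shared+write            -> %d\n", ex_accountable(EX_VM_WRITE | EX_VM_SHARED));
        printf("private+write+noreserve -> %d\n", ex_accountable(EX_VM_WRITE | EX_VM_NORESERVE));
        printf("private read-only       -> %d\n", ex_accountable(0));
        return 0;
}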
1463 unsigned long mmap_region(struct file *file, unsigned long addr, 1468 unsigned long mmap_region(struct file *file, unsigned long addr,
1464 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff) 1469 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
1465 { 1470 {
1466 struct mm_struct *mm = current->mm; 1471 struct mm_struct *mm = current->mm;
1467 struct vm_area_struct *vma, *prev; 1472 struct vm_area_struct *vma, *prev;
1468 int correct_wcount = 0; 1473 int correct_wcount = 0;
1469 int error; 1474 int error;
1470 struct rb_node **rb_link, *rb_parent; 1475 struct rb_node **rb_link, *rb_parent;
1471 unsigned long charged = 0; 1476 unsigned long charged = 0;
1472 struct inode *inode = file ? file_inode(file) : NULL; 1477 struct inode *inode = file ? file_inode(file) : NULL;
1473 1478
1474 /* Check against address space limit. */ 1479 /* Check against address space limit. */
1475 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { 1480 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
1476 unsigned long nr_pages; 1481 unsigned long nr_pages;
1477 1482
1478 /* 1483 /*
1479 * MAP_FIXED may remove pages of mappings that intersect with 1484 * MAP_FIXED may remove pages of mappings that intersect with
1480 * requested mapping. Account for the pages it would unmap. 1485 * requested mapping. Account for the pages it would unmap.
1481 */ 1486 */
1482 if (!(vm_flags & MAP_FIXED)) 1487 if (!(vm_flags & MAP_FIXED))
1483 return -ENOMEM; 1488 return -ENOMEM;
1484 1489
1485 nr_pages = count_vma_pages_range(mm, addr, addr + len); 1490 nr_pages = count_vma_pages_range(mm, addr, addr + len);
1486 1491
1487 if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages)) 1492 if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
1488 return -ENOMEM; 1493 return -ENOMEM;
1489 } 1494 }
1490 1495
1491 /* Clear old maps */ 1496 /* Clear old maps */
1492 error = -ENOMEM; 1497 error = -ENOMEM;
1493 munmap_back: 1498 munmap_back:
1494 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { 1499 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
1495 if (do_munmap(mm, addr, len)) 1500 if (do_munmap(mm, addr, len))
1496 return -ENOMEM; 1501 return -ENOMEM;
1497 goto munmap_back; 1502 goto munmap_back;
1498 } 1503 }
1499 1504
1500 /* 1505 /*
1501 * Private writable mapping: check memory availability 1506 * Private writable mapping: check memory availability
1502 */ 1507 */
1503 if (accountable_mapping(file, vm_flags)) { 1508 if (accountable_mapping(file, vm_flags)) {
1504 charged = len >> PAGE_SHIFT; 1509 charged = len >> PAGE_SHIFT;
1505 if (security_vm_enough_memory_mm(mm, charged)) 1510 if (security_vm_enough_memory_mm(mm, charged))
1506 return -ENOMEM; 1511 return -ENOMEM;
1507 vm_flags |= VM_ACCOUNT; 1512 vm_flags |= VM_ACCOUNT;
1508 } 1513 }
1509 1514
1510 /* 1515 /*
1511 * Can we just expand an old mapping? 1516 * Can we just expand an old mapping?
1512 */ 1517 */
1513 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL); 1518 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
1514 if (vma) 1519 if (vma)
1515 goto out; 1520 goto out;
1516 1521
1517 /* 1522 /*
1518 * Determine the object being mapped and call the appropriate 1523 * Determine the object being mapped and call the appropriate
1519 * specific mapper. The address has already been validated, but 1524 * specific mapper. The address has already been validated, but
1520 * not unmapped; the old mappings have been removed from the list. 1525 * not unmapped; the old mappings have been removed from the list.
1521 */ 1526 */
1522 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); 1527 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
1523 if (!vma) { 1528 if (!vma) {
1524 error = -ENOMEM; 1529 error = -ENOMEM;
1525 goto unacct_error; 1530 goto unacct_error;
1526 } 1531 }
1527 1532
1528 vma->vm_mm = mm; 1533 vma->vm_mm = mm;
1529 vma->vm_start = addr; 1534 vma->vm_start = addr;
1530 vma->vm_end = addr + len; 1535 vma->vm_end = addr + len;
1531 vma->vm_flags = vm_flags; 1536 vma->vm_flags = vm_flags;
1532 vma->vm_page_prot = vm_get_page_prot(vm_flags); 1537 vma->vm_page_prot = vm_get_page_prot(vm_flags);
1533 vma->vm_pgoff = pgoff; 1538 vma->vm_pgoff = pgoff;
1534 INIT_LIST_HEAD(&vma->anon_vma_chain); 1539 INIT_LIST_HEAD(&vma->anon_vma_chain);
1535 1540
1536 error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */ 1541 error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */
1537 1542
1538 if (file) { 1543 if (file) {
1539 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) 1544 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1540 goto free_vma; 1545 goto free_vma;
1541 if (vm_flags & VM_DENYWRITE) { 1546 if (vm_flags & VM_DENYWRITE) {
1542 error = deny_write_access(file); 1547 error = deny_write_access(file);
1543 if (error) 1548 if (error)
1544 goto free_vma; 1549 goto free_vma;
1545 correct_wcount = 1; 1550 correct_wcount = 1;
1546 } 1551 }
1547 vma->vm_file = get_file(file); 1552 vma->vm_file = get_file(file);
1548 error = file->f_op->mmap(file, vma); 1553 error = file->f_op->mmap(file, vma);
1549 if (error) 1554 if (error)
1550 goto unmap_and_free_vma; 1555 goto unmap_and_free_vma;
1551 1556
1552 /* Can addr have changed?? 1557 /* Can addr have changed??
1553 * 1558 *
1554 * Answer: Yes, several device drivers can do it in their 1559 * Answer: Yes, several device drivers can do it in their
1555 * f_op->mmap method. -DaveM 1560 * f_op->mmap method. -DaveM
1556 * Bug: If addr is changed, prev, rb_link, rb_parent should 1561 * Bug: If addr is changed, prev, rb_link, rb_parent should
1557 * be updated for vma_link() 1562 * be updated for vma_link()
1558 */ 1563 */
1559 WARN_ON_ONCE(addr != vma->vm_start); 1564 WARN_ON_ONCE(addr != vma->vm_start);
1560 1565
1561 addr = vma->vm_start; 1566 addr = vma->vm_start;
1562 pgoff = vma->vm_pgoff; 1567 pgoff = vma->vm_pgoff;
1563 vm_flags = vma->vm_flags; 1568 vm_flags = vma->vm_flags;
1564 } else if (vm_flags & VM_SHARED) { 1569 } else if (vm_flags & VM_SHARED) {
1565 if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP))) 1570 if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP)))
1566 goto free_vma; 1571 goto free_vma;
1567 error = shmem_zero_setup(vma); 1572 error = shmem_zero_setup(vma);
1568 if (error) 1573 if (error)
1569 goto free_vma; 1574 goto free_vma;
1570 } 1575 }
1571 1576
1572 if (vma_wants_writenotify(vma)) { 1577 if (vma_wants_writenotify(vma)) {
1573 pgprot_t pprot = vma->vm_page_prot; 1578 pgprot_t pprot = vma->vm_page_prot;
1574 1579
1575 /* Can vma->vm_page_prot have changed?? 1580 /* Can vma->vm_page_prot have changed??
1576 * 1581 *
1577 * Answer: Yes, drivers may have changed it in their 1582 * Answer: Yes, drivers may have changed it in their
1578 * f_op->mmap method. 1583 * f_op->mmap method.
1579 * 1584 *
1580 * Ensures that vmas marked as uncached stay that way. 1585 * Ensures that vmas marked as uncached stay that way.
1581 */ 1586 */
1582 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); 1587 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
1583 if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot))) 1588 if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot)))
1584 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 1589 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
1585 } 1590 }
1586 1591
1587 vma_link(mm, vma, prev, rb_link, rb_parent); 1592 vma_link(mm, vma, prev, rb_link, rb_parent);
1588 file = vma->vm_file; 1593 file = vma->vm_file;
1589 1594
1590 /* Once vma denies write, undo our temporary denial count */ 1595 /* Once vma denies write, undo our temporary denial count */
1591 if (correct_wcount) 1596 if (correct_wcount)
1592 atomic_inc(&inode->i_writecount); 1597 atomic_inc(&inode->i_writecount);
1593 out: 1598 out:
1594 perf_event_mmap(vma); 1599 perf_event_mmap(vma);
1595 1600
1596 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1601 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1597 if (vm_flags & VM_LOCKED) { 1602 if (vm_flags & VM_LOCKED) {
1598 if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || 1603 if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
1599 vma == get_gate_vma(current->mm))) 1604 vma == get_gate_vma(current->mm)))
1600 mm->locked_vm += (len >> PAGE_SHIFT); 1605 mm->locked_vm += (len >> PAGE_SHIFT);
1601 else 1606 else
1602 vma->vm_flags &= ~VM_LOCKED; 1607 vma->vm_flags &= ~VM_LOCKED;
1603 } 1608 }
1604 1609
1605 if (file) 1610 if (file)
1606 uprobe_mmap(vma); 1611 uprobe_mmap(vma);
1607 1612
1608 return addr; 1613 return addr;
1609 1614
1610 unmap_and_free_vma: 1615 unmap_and_free_vma:
1611 if (correct_wcount) 1616 if (correct_wcount)
1612 atomic_inc(&inode->i_writecount); 1617 atomic_inc(&inode->i_writecount);
1613 vma->vm_file = NULL; 1618 vma->vm_file = NULL;
1614 fput(file); 1619 fput(file);
1615 1620
1616 /* Undo any partial mapping done by a device driver. */ 1621 /* Undo any partial mapping done by a device driver. */
1617 unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); 1622 unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
1618 charged = 0; 1623 charged = 0;
1619 free_vma: 1624 free_vma:
1620 kmem_cache_free(vm_area_cachep, vma); 1625 kmem_cache_free(vm_area_cachep, vma);
1621 unacct_error: 1626 unacct_error:
1622 if (charged) 1627 if (charged)
1623 vm_unacct_memory(charged); 1628 vm_unacct_memory(charged);
1624 return error; 1629 return error;
1625 } 1630 }
1626 1631
1627 unsigned long unmapped_area(struct vm_unmapped_area_info *info) 1632 unsigned long unmapped_area(struct vm_unmapped_area_info *info)
1628 { 1633 {
1629 /* 1634 /*
1630 * We implement the search by looking for an rbtree node that 1635 * We implement the search by looking for an rbtree node that
1631 * immediately follows a suitable gap. That is, 1636 * immediately follows a suitable gap. That is,
1632 * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length; 1637 * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
1633 * - gap_end = vma->vm_start >= info->low_limit + length; 1638 * - gap_end = vma->vm_start >= info->low_limit + length;
1634 * - gap_end - gap_start >= length 1639 * - gap_end - gap_start >= length
1635 */ 1640 */
1636 1641
1637 struct mm_struct *mm = current->mm; 1642 struct mm_struct *mm = current->mm;
1638 struct vm_area_struct *vma; 1643 struct vm_area_struct *vma;
1639 unsigned long length, low_limit, high_limit, gap_start, gap_end; 1644 unsigned long length, low_limit, high_limit, gap_start, gap_end;
1640 1645
1641 /* Adjust search length to account for worst case alignment overhead */ 1646 /* Adjust search length to account for worst case alignment overhead */
1642 length = info->length + info->align_mask; 1647 length = info->length + info->align_mask;
1643 if (length < info->length) 1648 if (length < info->length)
1644 return -ENOMEM; 1649 return -ENOMEM;
1645 1650
1646 /* Adjust search limits by the desired length */ 1651 /* Adjust search limits by the desired length */
1647 if (info->high_limit < length) 1652 if (info->high_limit < length)
1648 return -ENOMEM; 1653 return -ENOMEM;
1649 high_limit = info->high_limit - length; 1654 high_limit = info->high_limit - length;
1650 1655
1651 if (info->low_limit > high_limit) 1656 if (info->low_limit > high_limit)
1652 return -ENOMEM; 1657 return -ENOMEM;
1653 low_limit = info->low_limit + length; 1658 low_limit = info->low_limit + length;
1654 1659
1655 /* Check if rbtree root looks promising */ 1660 /* Check if rbtree root looks promising */
1656 if (RB_EMPTY_ROOT(&mm->mm_rb)) 1661 if (RB_EMPTY_ROOT(&mm->mm_rb))
1657 goto check_highest; 1662 goto check_highest;
1658 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); 1663 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1659 if (vma->rb_subtree_gap < length) 1664 if (vma->rb_subtree_gap < length)
1660 goto check_highest; 1665 goto check_highest;
1661 1666
1662 while (true) { 1667 while (true) {
1663 /* Visit left subtree if it looks promising */ 1668 /* Visit left subtree if it looks promising */
1664 gap_end = vma->vm_start; 1669 gap_end = vma->vm_start;
1665 if (gap_end >= low_limit && vma->vm_rb.rb_left) { 1670 if (gap_end >= low_limit && vma->vm_rb.rb_left) {
1666 struct vm_area_struct *left = 1671 struct vm_area_struct *left =
1667 rb_entry(vma->vm_rb.rb_left, 1672 rb_entry(vma->vm_rb.rb_left,
1668 struct vm_area_struct, vm_rb); 1673 struct vm_area_struct, vm_rb);
1669 if (left->rb_subtree_gap >= length) { 1674 if (left->rb_subtree_gap >= length) {
1670 vma = left; 1675 vma = left;
1671 continue; 1676 continue;
1672 } 1677 }
1673 } 1678 }
1674 1679
1675 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; 1680 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
1676 check_current: 1681 check_current:
1677 /* Check if current node has a suitable gap */ 1682 /* Check if current node has a suitable gap */
1678 if (gap_start > high_limit) 1683 if (gap_start > high_limit)
1679 return -ENOMEM; 1684 return -ENOMEM;
1680 if (gap_end >= low_limit && gap_end - gap_start >= length) 1685 if (gap_end >= low_limit && gap_end - gap_start >= length)
1681 goto found; 1686 goto found;
1682 1687
1683 /* Visit right subtree if it looks promising */ 1688 /* Visit right subtree if it looks promising */
1684 if (vma->vm_rb.rb_right) { 1689 if (vma->vm_rb.rb_right) {
1685 struct vm_area_struct *right = 1690 struct vm_area_struct *right =
1686 rb_entry(vma->vm_rb.rb_right, 1691 rb_entry(vma->vm_rb.rb_right,
1687 struct vm_area_struct, vm_rb); 1692 struct vm_area_struct, vm_rb);
1688 if (right->rb_subtree_gap >= length) { 1693 if (right->rb_subtree_gap >= length) {
1689 vma = right; 1694 vma = right;
1690 continue; 1695 continue;
1691 } 1696 }
1692 } 1697 }
1693 1698
1694 /* Go back up the rbtree to find next candidate node */ 1699 /* Go back up the rbtree to find next candidate node */
1695 while (true) { 1700 while (true) {
1696 struct rb_node *prev = &vma->vm_rb; 1701 struct rb_node *prev = &vma->vm_rb;
1697 if (!rb_parent(prev)) 1702 if (!rb_parent(prev))
1698 goto check_highest; 1703 goto check_highest;
1699 vma = rb_entry(rb_parent(prev), 1704 vma = rb_entry(rb_parent(prev),
1700 struct vm_area_struct, vm_rb); 1705 struct vm_area_struct, vm_rb);
1701 if (prev == vma->vm_rb.rb_left) { 1706 if (prev == vma->vm_rb.rb_left) {
1702 gap_start = vma->vm_prev->vm_end; 1707 gap_start = vma->vm_prev->vm_end;
1703 gap_end = vma->vm_start; 1708 gap_end = vma->vm_start;
1704 goto check_current; 1709 goto check_current;
1705 } 1710 }
1706 } 1711 }
1707 } 1712 }
1708 1713
1709 check_highest: 1714 check_highest:
1710 /* Check highest gap, which does not precede any rbtree node */ 1715 /* Check highest gap, which does not precede any rbtree node */
1711 gap_start = mm->highest_vm_end; 1716 gap_start = mm->highest_vm_end;
1712 gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */ 1717 gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
1713 if (gap_start > high_limit) 1718 if (gap_start > high_limit)
1714 return -ENOMEM; 1719 return -ENOMEM;
1715 1720
1716 found: 1721 found:
1717 /* We found a suitable gap. Clip it with the original low_limit. */ 1722 /* We found a suitable gap. Clip it with the original low_limit. */
1718 if (gap_start < info->low_limit) 1723 if (gap_start < info->low_limit)
1719 gap_start = info->low_limit; 1724 gap_start = info->low_limit;
1720 1725
1721 /* Adjust gap address to the desired alignment */ 1726 /* Adjust gap address to the desired alignment */
1722 gap_start += (info->align_offset - gap_start) & info->align_mask; 1727 gap_start += (info->align_offset - gap_start) & info->align_mask;
1723 1728
1724 VM_BUG_ON(gap_start + info->length > info->high_limit); 1729 VM_BUG_ON(gap_start + info->length > info->high_limit);
1725 VM_BUG_ON(gap_start + info->length > gap_end); 1730 VM_BUG_ON(gap_start + info->length > gap_end);
1726 return gap_start; 1731 return gap_start;
1727 } 1732 }
1728 1733
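The final step of unmapped_area() rounds gap_start up to the next address that is congruent to align_offset modulo (align_mask + 1), using the expression gap_start += (align_offset - gap_start) & align_mask. The sketch below runs that expression on made-up numbers (a 2 MB alignment mask) purely to show the arithmetic.

#include <stdio.h>

int main(void)
{
        unsigned long gap_start    = 0x7f0000003000UL;  /* invented gap start */
        unsigned long align_mask   = 0x1fffffUL;        /* 2 MB alignment */
        unsigned long align_offset = 0;

        gap_start += (align_offset - gap_start) & align_mask;
        printf("aligned gap_start = %#lx\n", gap_start); /* 0x7f0000200000 */
        return 0;
}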
1729 unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) 1734 unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
1730 { 1735 {
1731 struct mm_struct *mm = current->mm; 1736 struct mm_struct *mm = current->mm;
1732 struct vm_area_struct *vma; 1737 struct vm_area_struct *vma;
1733 unsigned long length, low_limit, high_limit, gap_start, gap_end; 1738 unsigned long length, low_limit, high_limit, gap_start, gap_end;
1734 1739
1735 /* Adjust search length to account for worst case alignment overhead */ 1740 /* Adjust search length to account for worst case alignment overhead */
1736 length = info->length + info->align_mask; 1741 length = info->length + info->align_mask;
1737 if (length < info->length) 1742 if (length < info->length)
1738 return -ENOMEM; 1743 return -ENOMEM;
1739 1744
1740 /* 1745 /*
1741 * Adjust search limits by the desired length. 1746 * Adjust search limits by the desired length.
1742 * See implementation comment at top of unmapped_area(). 1747 * See implementation comment at top of unmapped_area().
1743 */ 1748 */
1744 gap_end = info->high_limit; 1749 gap_end = info->high_limit;
1745 if (gap_end < length) 1750 if (gap_end < length)
1746 return -ENOMEM; 1751 return -ENOMEM;
1747 high_limit = gap_end - length; 1752 high_limit = gap_end - length;
1748 1753
1749 if (info->low_limit > high_limit) 1754 if (info->low_limit > high_limit)
1750 return -ENOMEM; 1755 return -ENOMEM;
1751 low_limit = info->low_limit + length; 1756 low_limit = info->low_limit + length;
1752 1757
1753 /* Check highest gap, which does not precede any rbtree node */ 1758 /* Check highest gap, which does not precede any rbtree node */
1754 gap_start = mm->highest_vm_end; 1759 gap_start = mm->highest_vm_end;
1755 if (gap_start <= high_limit) 1760 if (gap_start <= high_limit)
1756 goto found_highest; 1761 goto found_highest;
1757 1762
1758 /* Check if rbtree root looks promising */ 1763 /* Check if rbtree root looks promising */
1759 if (RB_EMPTY_ROOT(&mm->mm_rb)) 1764 if (RB_EMPTY_ROOT(&mm->mm_rb))
1760 return -ENOMEM; 1765 return -ENOMEM;
1761 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); 1766 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1762 if (vma->rb_subtree_gap < length) 1767 if (vma->rb_subtree_gap < length)
1763 return -ENOMEM; 1768 return -ENOMEM;
1764 1769
1765 while (true) { 1770 while (true) {
1766 /* Visit right subtree if it looks promising */ 1771 /* Visit right subtree if it looks promising */
1767 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; 1772 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
1768 if (gap_start <= high_limit && vma->vm_rb.rb_right) { 1773 if (gap_start <= high_limit && vma->vm_rb.rb_right) {
1769 struct vm_area_struct *right = 1774 struct vm_area_struct *right =
1770 rb_entry(vma->vm_rb.rb_right, 1775 rb_entry(vma->vm_rb.rb_right,
1771 struct vm_area_struct, vm_rb); 1776 struct vm_area_struct, vm_rb);
1772 if (right->rb_subtree_gap >= length) { 1777 if (right->rb_subtree_gap >= length) {
1773 vma = right; 1778 vma = right;
1774 continue; 1779 continue;
1775 } 1780 }
1776 } 1781 }
1777 1782
1778 check_current: 1783 check_current:
1779 /* Check if current node has a suitable gap */ 1784 /* Check if current node has a suitable gap */
1780 gap_end = vma->vm_start; 1785 gap_end = vma->vm_start;
1781 if (gap_end < low_limit) 1786 if (gap_end < low_limit)
1782 return -ENOMEM; 1787 return -ENOMEM;
1783 if (gap_start <= high_limit && gap_end - gap_start >= length) 1788 if (gap_start <= high_limit && gap_end - gap_start >= length)
1784 goto found; 1789 goto found;
1785 1790
1786 /* Visit left subtree if it looks promising */ 1791 /* Visit left subtree if it looks promising */
1787 if (vma->vm_rb.rb_left) { 1792 if (vma->vm_rb.rb_left) {
1788 struct vm_area_struct *left = 1793 struct vm_area_struct *left =
1789 rb_entry(vma->vm_rb.rb_left, 1794 rb_entry(vma->vm_rb.rb_left,
1790 struct vm_area_struct, vm_rb); 1795 struct vm_area_struct, vm_rb);
1791 if (left->rb_subtree_gap >= length) { 1796 if (left->rb_subtree_gap >= length) {
1792 vma = left; 1797 vma = left;
1793 continue; 1798 continue;
1794 } 1799 }
1795 } 1800 }
1796 1801
1797 /* Go back up the rbtree to find next candidate node */ 1802 /* Go back up the rbtree to find next candidate node */
1798 while (true) { 1803 while (true) {
1799 struct rb_node *prev = &vma->vm_rb; 1804 struct rb_node *prev = &vma->vm_rb;
1800 if (!rb_parent(prev)) 1805 if (!rb_parent(prev))
1801 return -ENOMEM; 1806 return -ENOMEM;
1802 vma = rb_entry(rb_parent(prev), 1807 vma = rb_entry(rb_parent(prev),
1803 struct vm_area_struct, vm_rb); 1808 struct vm_area_struct, vm_rb);
1804 if (prev == vma->vm_rb.rb_right) { 1809 if (prev == vma->vm_rb.rb_right) {
1805 gap_start = vma->vm_prev ? 1810 gap_start = vma->vm_prev ?
1806 vma->vm_prev->vm_end : 0; 1811 vma->vm_prev->vm_end : 0;
1807 goto check_current; 1812 goto check_current;
1808 } 1813 }
1809 } 1814 }
1810 } 1815 }
1811 1816
1812 found: 1817 found:
1813 /* We found a suitable gap. Clip it with the original high_limit. */ 1818 /* We found a suitable gap. Clip it with the original high_limit. */
1814 if (gap_end > info->high_limit) 1819 if (gap_end > info->high_limit)
1815 gap_end = info->high_limit; 1820 gap_end = info->high_limit;
1816 1821
1817 found_highest: 1822 found_highest:
1818 /* Compute highest gap address at the desired alignment */ 1823 /* Compute highest gap address at the desired alignment */
1819 gap_end -= info->length; 1824 gap_end -= info->length;
1820 gap_end -= (gap_end - info->align_offset) & info->align_mask; 1825 gap_end -= (gap_end - info->align_offset) & info->align_mask;
1821 1826
1822 VM_BUG_ON(gap_end < info->low_limit); 1827 VM_BUG_ON(gap_end < info->low_limit);
1823 VM_BUG_ON(gap_end < gap_start); 1828 VM_BUG_ON(gap_end < gap_start);
1824 return gap_end; 1829 return gap_end;
1825 } 1830 }
1826 1831
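unmapped_area_topdown() uses the mirror image of the round-up shown after unmapped_area(): after reserving room for the request, gap_end -= (gap_end - align_offset) & align_mask rounds the candidate address down to the alignment boundary. Again the numbers below are invented and only demonstrate the arithmetic.

#include <stdio.h>

int main(void)
{
        unsigned long gap_end      = 0x7ffff7a01000UL;  /* invented gap end */
        unsigned long length       = 0x200000UL;        /* 2 MB request */
        unsigned long align_mask   = 0x1fffffUL;        /* 2 MB alignment */
        unsigned long align_offset = 0;

        gap_end -= length;
        gap_end -= (gap_end - align_offset) & align_mask;
        printf("topdown aligned start = %#lx\n", gap_end); /* 0x7ffff7800000 */
        return 0;
}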
1827 /* Get an address range which is currently unmapped. 1832 /* Get an address range which is currently unmapped.
1828 * For shmat() with addr=0. 1833 * For shmat() with addr=0.
1829 * 1834 *
1830 * Ugly calling convention alert: 1835 * Ugly calling convention alert:
1831 * Return value with the low bits set means error value, 1836 * Return value with the low bits set means error value,
1832 * ie 1837 * ie
1833 * if (ret & ~PAGE_MASK) 1838 * if (ret & ~PAGE_MASK)
1834 * error = ret; 1839 * error = ret;
1835 * 1840 *
1836 * This function "knows" that -ENOMEM has the bits set. 1841 * This function "knows" that -ENOMEM has the bits set.
1837 */ 1842 */
1838 #ifndef HAVE_ARCH_UNMAPPED_AREA 1843 #ifndef HAVE_ARCH_UNMAPPED_AREA
1839 unsigned long 1844 unsigned long
1840 arch_get_unmapped_area(struct file *filp, unsigned long addr, 1845 arch_get_unmapped_area(struct file *filp, unsigned long addr,
1841 unsigned long len, unsigned long pgoff, unsigned long flags) 1846 unsigned long len, unsigned long pgoff, unsigned long flags)
1842 { 1847 {
1843 struct mm_struct *mm = current->mm; 1848 struct mm_struct *mm = current->mm;
1844 struct vm_area_struct *vma; 1849 struct vm_area_struct *vma;
1845 struct vm_unmapped_area_info info; 1850 struct vm_unmapped_area_info info;
1846 1851
1847 if (len > TASK_SIZE) 1852 if (len > TASK_SIZE)
1848 return -ENOMEM; 1853 return -ENOMEM;
1849 1854
1850 if (flags & MAP_FIXED) 1855 if (flags & MAP_FIXED)
1851 return addr; 1856 return addr;
1852 1857
1853 if (addr) { 1858 if (addr) {
1854 addr = PAGE_ALIGN(addr); 1859 addr = PAGE_ALIGN(addr);
1855 vma = find_vma(mm, addr); 1860 vma = find_vma(mm, addr);
1856 if (TASK_SIZE - len >= addr && 1861 if (TASK_SIZE - len >= addr &&
1857 (!vma || addr + len <= vma->vm_start)) 1862 (!vma || addr + len <= vma->vm_start))
1858 return addr; 1863 return addr;
1859 } 1864 }
1860 1865
1861 info.flags = 0; 1866 info.flags = 0;
1862 info.length = len; 1867 info.length = len;
1863 info.low_limit = TASK_UNMAPPED_BASE; 1868 info.low_limit = TASK_UNMAPPED_BASE;
1864 info.high_limit = TASK_SIZE; 1869 info.high_limit = TASK_SIZE;
1865 info.align_mask = 0; 1870 info.align_mask = 0;
1866 return vm_unmapped_area(&info); 1871 return vm_unmapped_area(&info);
1867 } 1872 }
1868 #endif 1873 #endif
1869 1874
1870 void arch_unmap_area(struct mm_struct *mm, unsigned long addr) 1875 void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1871 { 1876 {
1872 /* 1877 /*
1873 * Is this a new hole at the lowest possible address? 1878 * Is this a new hole at the lowest possible address?
1874 */ 1879 */
1875 if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) 1880 if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache)
1876 mm->free_area_cache = addr; 1881 mm->free_area_cache = addr;
1877 } 1882 }
1878 1883
1879 /* 1884 /*
1880 * This mmap-allocator allocates new areas top-down from below the 1885 * This mmap-allocator allocates new areas top-down from below the
1881 * stack's low limit (the base): 1886 * stack's low limit (the base):
1882 */ 1887 */
1883 #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN 1888 #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
1884 unsigned long 1889 unsigned long
1885 arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, 1890 arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1886 const unsigned long len, const unsigned long pgoff, 1891 const unsigned long len, const unsigned long pgoff,
1887 const unsigned long flags) 1892 const unsigned long flags)
1888 { 1893 {
1889 struct vm_area_struct *vma; 1894 struct vm_area_struct *vma;
1890 struct mm_struct *mm = current->mm; 1895 struct mm_struct *mm = current->mm;
1891 unsigned long addr = addr0; 1896 unsigned long addr = addr0;
1892 struct vm_unmapped_area_info info; 1897 struct vm_unmapped_area_info info;
1893 1898
1894 /* requested length too big for entire address space */ 1899 /* requested length too big for entire address space */
1895 if (len > TASK_SIZE) 1900 if (len > TASK_SIZE)
1896 return -ENOMEM; 1901 return -ENOMEM;
1897 1902
1898 if (flags & MAP_FIXED) 1903 if (flags & MAP_FIXED)
1899 return addr; 1904 return addr;
1900 1905
1901 /* requesting a specific address */ 1906 /* requesting a specific address */
1902 if (addr) { 1907 if (addr) {
1903 addr = PAGE_ALIGN(addr); 1908 addr = PAGE_ALIGN(addr);
1904 vma = find_vma(mm, addr); 1909 vma = find_vma(mm, addr);
1905 if (TASK_SIZE - len >= addr && 1910 if (TASK_SIZE - len >= addr &&
1906 (!vma || addr + len <= vma->vm_start)) 1911 (!vma || addr + len <= vma->vm_start))
1907 return addr; 1912 return addr;
1908 } 1913 }
1909 1914
1910 info.flags = VM_UNMAPPED_AREA_TOPDOWN; 1915 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
1911 info.length = len; 1916 info.length = len;
1912 info.low_limit = PAGE_SIZE; 1917 info.low_limit = PAGE_SIZE;
1913 info.high_limit = mm->mmap_base; 1918 info.high_limit = mm->mmap_base;
1914 info.align_mask = 0; 1919 info.align_mask = 0;
1915 addr = vm_unmapped_area(&info); 1920 addr = vm_unmapped_area(&info);
1916 1921
1917 /* 1922 /*
1918 * A failed mmap() very likely causes application failure, 1923 * A failed mmap() very likely causes application failure,
1919 * so fall back to the bottom-up function here. This scenario 1924 * so fall back to the bottom-up function here. This scenario
1920 * can happen with large stack limits and large mmap() 1925 * can happen with large stack limits and large mmap()
1921 * allocations. 1926 * allocations.
1922 */ 1927 */
1923 if (addr & ~PAGE_MASK) { 1928 if (addr & ~PAGE_MASK) {
1924 VM_BUG_ON(addr != -ENOMEM); 1929 VM_BUG_ON(addr != -ENOMEM);
1925 info.flags = 0; 1930 info.flags = 0;
1926 info.low_limit = TASK_UNMAPPED_BASE; 1931 info.low_limit = TASK_UNMAPPED_BASE;
1927 info.high_limit = TASK_SIZE; 1932 info.high_limit = TASK_SIZE;
1928 addr = vm_unmapped_area(&info); 1933 addr = vm_unmapped_area(&info);
1929 } 1934 }
1930 1935
1931 return addr; 1936 return addr;
1932 } 1937 }
1933 #endif 1938 #endif
1934 1939
1935 void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr) 1940 void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
1936 { 1941 {
1937 /* 1942 /*
1938 * Is this a new hole at the highest possible address? 1943 * Is this a new hole at the highest possible address?
1939 */ 1944 */
1940 if (addr > mm->free_area_cache) 1945 if (addr > mm->free_area_cache)
1941 mm->free_area_cache = addr; 1946 mm->free_area_cache = addr;
1942 1947
1943 /* don't allow allocations above current base */ 1948 /* don't allow allocations above current base */

1944 if (mm->free_area_cache > mm->mmap_base) 1949 if (mm->free_area_cache > mm->mmap_base)
1945 mm->free_area_cache = mm->mmap_base; 1950 mm->free_area_cache = mm->mmap_base;
1946 } 1951 }
1947 1952
1948 unsigned long 1953 unsigned long
1949 get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, 1954 get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
1950 unsigned long pgoff, unsigned long flags) 1955 unsigned long pgoff, unsigned long flags)
1951 { 1956 {
1952 unsigned long (*get_area)(struct file *, unsigned long, 1957 unsigned long (*get_area)(struct file *, unsigned long,
1953 unsigned long, unsigned long, unsigned long); 1958 unsigned long, unsigned long, unsigned long);
1954 1959
1955 unsigned long error = arch_mmap_check(addr, len, flags); 1960 unsigned long error = arch_mmap_check(addr, len, flags);
1956 if (error) 1961 if (error)
1957 return error; 1962 return error;
1958 1963
1959 /* Careful about overflows.. */ 1964 /* Careful about overflows.. */
1960 if (len > TASK_SIZE) 1965 if (len > TASK_SIZE)
1961 return -ENOMEM; 1966 return -ENOMEM;
1962 1967
1963 get_area = current->mm->get_unmapped_area; 1968 get_area = current->mm->get_unmapped_area;
1964 if (file && file->f_op && file->f_op->get_unmapped_area) 1969 if (file && file->f_op && file->f_op->get_unmapped_area)
1965 get_area = file->f_op->get_unmapped_area; 1970 get_area = file->f_op->get_unmapped_area;
1966 addr = get_area(file, addr, len, pgoff, flags); 1971 addr = get_area(file, addr, len, pgoff, flags);
1967 if (IS_ERR_VALUE(addr)) 1972 if (IS_ERR_VALUE(addr))
1968 return addr; 1973 return addr;
1969 1974
1970 if (addr > TASK_SIZE - len) 1975 if (addr > TASK_SIZE - len)
1971 return -ENOMEM; 1976 return -ENOMEM;
1972 if (addr & ~PAGE_MASK) 1977 if (addr & ~PAGE_MASK)
1973 return -EINVAL; 1978 return -EINVAL;
1974 1979
1975 addr = arch_rebalance_pgtables(addr, len); 1980 addr = arch_rebalance_pgtables(addr, len);
1976 error = security_mmap_addr(addr); 1981 error = security_mmap_addr(addr);
1977 return error ? error : addr; 1982 return error ? error : addr;
1978 } 1983 }
1979 1984
1980 EXPORT_SYMBOL(get_unmapped_area); 1985 EXPORT_SYMBOL(get_unmapped_area);
1981 1986
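get_unmapped_area() follows the "low bits set means error" convention spelled out in the comment above arch_get_unmapped_area(): a negative errno stored in an unsigned long always has bits inside ~PAGE_MASK set, so one mask test tells an error apart from a page-aligned address. A small sketch of that encoding, assuming 4 KB pages and using -12 as the usual value of -ENOMEM:

#include <stdio.h>

#define EX_PAGE_MASK (~(4096UL - 1))

int main(void)
{
        unsigned long ok  = 0x7f0000000000UL;   /* page-aligned address */
        unsigned long err = (unsigned long)-12; /* -ENOMEM */

        printf("ok  looks like an error? %d\n", (ok  & ~EX_PAGE_MASK) != 0); /* 0 */
        printf("err looks like an error? %d\n", (err & ~EX_PAGE_MASK) != 0); /* 1 */
        return 0;
}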
1982 /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ 1987 /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
1983 struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) 1988 struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1984 { 1989 {
1985 struct vm_area_struct *vma = NULL; 1990 struct vm_area_struct *vma = NULL;
1986 1991
1987 /* Check the cache first. */ 1992 /* Check the cache first. */
1988 /* (Cache hit rate is typically around 35%.) */ 1993 /* (Cache hit rate is typically around 35%.) */
1989 vma = ACCESS_ONCE(mm->mmap_cache); 1994 vma = ACCESS_ONCE(mm->mmap_cache);
1990 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { 1995 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
1991 struct rb_node *rb_node; 1996 struct rb_node *rb_node;
1992 1997
1993 rb_node = mm->mm_rb.rb_node; 1998 rb_node = mm->mm_rb.rb_node;
1994 vma = NULL; 1999 vma = NULL;
1995 2000
1996 while (rb_node) { 2001 while (rb_node) {
1997 struct vm_area_struct *vma_tmp; 2002 struct vm_area_struct *vma_tmp;
1998 2003
1999 vma_tmp = rb_entry(rb_node, 2004 vma_tmp = rb_entry(rb_node,
2000 struct vm_area_struct, vm_rb); 2005 struct vm_area_struct, vm_rb);
2001 2006
2002 if (vma_tmp->vm_end > addr) { 2007 if (vma_tmp->vm_end > addr) {
2003 vma = vma_tmp; 2008 vma = vma_tmp;
2004 if (vma_tmp->vm_start <= addr) 2009 if (vma_tmp->vm_start <= addr)
2005 break; 2010 break;
2006 rb_node = rb_node->rb_left; 2011 rb_node = rb_node->rb_left;
2007 } else 2012 } else
2008 rb_node = rb_node->rb_right; 2013 rb_node = rb_node->rb_right;
2009 } 2014 }
2010 if (vma) 2015 if (vma)
2011 mm->mmap_cache = vma; 2016 mm->mmap_cache = vma;
2012 } 2017 }
2013 return vma; 2018 return vma;
2014 } 2019 }
2015 2020
2016 EXPORT_SYMBOL(find_vma); 2021 EXPORT_SYMBOL(find_vma);
2017 2022
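Note the semantic of find_vma(): it returns the first VMA whose vm_end lies above addr, which need not actually contain addr. The sketch below replays that contract over a plain sorted array with invented ranges; ex_find_vma() is a stand-in, while the kernel of course walks the rbtree with the mmap_cache shortcut shown above.

#include <stdio.h>

struct ex_vma { unsigned long start, end; };    /* [start, end) */

/* Linear stand-in for find_vma(): first area with end > addr, or NULL. */
static const struct ex_vma *ex_find_vma(const struct ex_vma *v, int n,
                                        unsigned long addr)
{
        for (int i = 0; i < n; i++)
                if (v[i].end > addr)
                        return &v[i];
        return NULL;
}

int main(void)
{
        const struct ex_vma map[] = {
                { 0x1000, 0x3000 },
                { 0x8000, 0x9000 },
        };
        int n = (int)(sizeof(map) / sizeof(map[0]));
        const struct ex_vma *hit = ex_find_vma(map, n, 0x5000);

        /* 0x5000 is unmapped, yet the lookup reports the next area above it. */
        if (hit)
                printf("returned [%#lx, %#lx)\n", hit->start, hit->end);
        return 0;
}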
2018 /* 2023 /*
2019 * Same as find_vma, but also return a pointer to the previous VMA in *pprev. 2024 * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
2020 */ 2025 */
2021 struct vm_area_struct * 2026 struct vm_area_struct *
2022 find_vma_prev(struct mm_struct *mm, unsigned long addr, 2027 find_vma_prev(struct mm_struct *mm, unsigned long addr,
2023 struct vm_area_struct **pprev) 2028 struct vm_area_struct **pprev)
2024 { 2029 {
2025 struct vm_area_struct *vma; 2030 struct vm_area_struct *vma;
2026 2031
2027 vma = find_vma(mm, addr); 2032 vma = find_vma(mm, addr);
2028 if (vma) { 2033 if (vma) {
2029 *pprev = vma->vm_prev; 2034 *pprev = vma->vm_prev;
2030 } else { 2035 } else {
2031 struct rb_node *rb_node = mm->mm_rb.rb_node; 2036 struct rb_node *rb_node = mm->mm_rb.rb_node;
2032 *pprev = NULL; 2037 *pprev = NULL;
2033 while (rb_node) { 2038 while (rb_node) {
2034 *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb); 2039 *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb);
2035 rb_node = rb_node->rb_right; 2040 rb_node = rb_node->rb_right;
2036 } 2041 }
2037 } 2042 }
2038 return vma; 2043 return vma;
2039 } 2044 }
2040 2045
2041 /* 2046 /*
2042 * Verify that the stack growth is acceptable and 2047 * Verify that the stack growth is acceptable and
2043 * update accounting. This is shared with both the 2048 * update accounting. This is shared with both the
2044 * grow-up and grow-down cases. 2049 * grow-up and grow-down cases.
2045 */ 2050 */
2046 static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) 2051 static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow)
2047 { 2052 {
2048 struct mm_struct *mm = vma->vm_mm; 2053 struct mm_struct *mm = vma->vm_mm;
2049 struct rlimit *rlim = current->signal->rlim; 2054 struct rlimit *rlim = current->signal->rlim;
2050 unsigned long new_start; 2055 unsigned long new_start;
2051 2056
2052 /* address space limit tests */ 2057 /* address space limit tests */
2053 if (!may_expand_vm(mm, grow)) 2058 if (!may_expand_vm(mm, grow))
2054 return -ENOMEM; 2059 return -ENOMEM;
2055 2060
2056 /* Stack limit test */ 2061 /* Stack limit test */
2057 if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) 2062 if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
2058 return -ENOMEM; 2063 return -ENOMEM;
2059 2064
2060 /* mlock limit tests */ 2065 /* mlock limit tests */
2061 if (vma->vm_flags & VM_LOCKED) { 2066 if (vma->vm_flags & VM_LOCKED) {
2062 unsigned long locked; 2067 unsigned long locked;
2063 unsigned long limit; 2068 unsigned long limit;
2064 locked = mm->locked_vm + grow; 2069 locked = mm->locked_vm + grow;
2065 limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); 2070 limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
2066 limit >>= PAGE_SHIFT; 2071 limit >>= PAGE_SHIFT;
2067 if (locked > limit && !capable(CAP_IPC_LOCK)) 2072 if (locked > limit && !capable(CAP_IPC_LOCK))
2068 return -ENOMEM; 2073 return -ENOMEM;
2069 } 2074 }
2070 2075
2071 /* Check to ensure the stack will not grow into a hugetlb-only region */ 2076 /* Check to ensure the stack will not grow into a hugetlb-only region */
2072 new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : 2077 new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
2073 vma->vm_end - size; 2078 vma->vm_end - size;
2074 if (is_hugepage_only_range(vma->vm_mm, new_start, size)) 2079 if (is_hugepage_only_range(vma->vm_mm, new_start, size))
2075 return -EFAULT; 2080 return -EFAULT;
2076 2081
2077 /* 2082 /*
2078 * Overcommit.. This must be the final test, as it will 2083 * Overcommit.. This must be the final test, as it will
2079 * update security statistics. 2084 * update security statistics.
2080 */ 2085 */
2081 if (security_vm_enough_memory_mm(mm, grow)) 2086 if (security_vm_enough_memory_mm(mm, grow))
2082 return -ENOMEM; 2087 return -ENOMEM;
2083 2088
2084 /* Ok, everything looks good - let it rip */ 2089 /* Ok, everything looks good - let it rip */
2085 if (vma->vm_flags & VM_LOCKED) 2090 if (vma->vm_flags & VM_LOCKED)
2086 mm->locked_vm += grow; 2091 mm->locked_vm += grow;
2087 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); 2092 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
2088 return 0; 2093 return 0;
2089 } 2094 }
2090 2095
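The first limit tests in acct_stack_growth() have direct userspace counterparts via getrlimit(). The sketch below echoes only the RLIMIT_STACK comparison; ex_stack_growth_ok() is a made-up name, and the address-space, mlock, and overcommit tests of the real function are deliberately omitted.

#include <stdio.h>
#include <sys/resource.h>

/* ex_stack_growth_ok(): would a stack of new_stack_size bytes still fit
 * within the soft RLIMIT_STACK? (Other checks are left out.) */
static int ex_stack_growth_ok(unsigned long new_stack_size)
{
        struct rlimit rl;

        if (getrlimit(RLIMIT_STACK, &rl))
                return 0;
        return new_stack_size <= rl.rlim_cur;
}

int main(void)
{
        printf("grow to 1 MB ok? %d\n", ex_stack_growth_ok(1UL << 20));
        printf("grow to 1 GB ok? %d\n", ex_stack_growth_ok(1UL << 30));
        return 0;
}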
2091 #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) 2096 #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
2092 /* 2097 /*
2093 * PA-RISC uses this for its stack; IA64 for its Register Backing Store. 2098 * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
2094 * vma is the last one with address > vma->vm_end. Have to extend vma. 2099 * vma is the last one with address > vma->vm_end. Have to extend vma.
2095 */ 2100 */
2096 int expand_upwards(struct vm_area_struct *vma, unsigned long address) 2101 int expand_upwards(struct vm_area_struct *vma, unsigned long address)
2097 { 2102 {
2098 int error; 2103 int error;
2099 2104
2100 if (!(vma->vm_flags & VM_GROWSUP)) 2105 if (!(vma->vm_flags & VM_GROWSUP))
2101 return -EFAULT; 2106 return -EFAULT;
2102 2107
2103 /* 2108 /*
2104 * We must make sure the anon_vma is allocated 2109 * We must make sure the anon_vma is allocated
2105 * so that the anon_vma locking is not a noop. 2110 * so that the anon_vma locking is not a noop.
2106 */ 2111 */
2107 if (unlikely(anon_vma_prepare(vma))) 2112 if (unlikely(anon_vma_prepare(vma)))
2108 return -ENOMEM; 2113 return -ENOMEM;
2109 vma_lock_anon_vma(vma); 2114 vma_lock_anon_vma(vma);
2110 2115
2111 /* 2116 /*
2112 * vma->vm_start/vm_end cannot change under us because the caller 2117 * vma->vm_start/vm_end cannot change under us because the caller
2113 * is required to hold the mmap_sem in read mode. We need the 2118 * is required to hold the mmap_sem in read mode. We need the
2114 * anon_vma lock to serialize against concurrent expand_stacks. 2119 * anon_vma lock to serialize against concurrent expand_stacks.
2115 * Also guard against wrapping around to address 0. 2120 * Also guard against wrapping around to address 0.
2116 */ 2121 */
2117 if (address < PAGE_ALIGN(address+4)) 2122 if (address < PAGE_ALIGN(address+4))
2118 address = PAGE_ALIGN(address+4); 2123 address = PAGE_ALIGN(address+4);
2119 else { 2124 else {
2120 vma_unlock_anon_vma(vma); 2125 vma_unlock_anon_vma(vma);
2121 return -ENOMEM; 2126 return -ENOMEM;
2122 } 2127 }
2123 error = 0; 2128 error = 0;
2124 2129
2125 /* Somebody else might have raced and expanded it already */ 2130 /* Somebody else might have raced and expanded it already */
2126 if (address > vma->vm_end) { 2131 if (address > vma->vm_end) {
2127 unsigned long size, grow; 2132 unsigned long size, grow;
2128 2133
2129 size = address - vma->vm_start; 2134 size = address - vma->vm_start;
2130 grow = (address - vma->vm_end) >> PAGE_SHIFT; 2135 grow = (address - vma->vm_end) >> PAGE_SHIFT;
2131 2136
2132 error = -ENOMEM; 2137 error = -ENOMEM;
2133 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { 2138 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
2134 error = acct_stack_growth(vma, size, grow); 2139 error = acct_stack_growth(vma, size, grow);
2135 if (!error) { 2140 if (!error) {
2136 /* 2141 /*
2137 * vma_gap_update() doesn't support concurrent 2142 * vma_gap_update() doesn't support concurrent
2138 * updates, but we only hold a shared mmap_sem 2143 * updates, but we only hold a shared mmap_sem
2139 * lock here, so we need to protect against 2144 * lock here, so we need to protect against
2140 * concurrent vma expansions. 2145 * concurrent vma expansions.
2141 * vma_lock_anon_vma() doesn't help here, as 2146 * vma_lock_anon_vma() doesn't help here, as
2142 * we don't guarantee that all growable vmas 2147 * we don't guarantee that all growable vmas
2143 * in a mm share the same root anon vma. 2148 * in a mm share the same root anon vma.
2144 * So, we reuse mm->page_table_lock to guard 2149 * So, we reuse mm->page_table_lock to guard
2145 * against concurrent vma expansions. 2150 * against concurrent vma expansions.
2146 */ 2151 */
2147 spin_lock(&vma->vm_mm->page_table_lock); 2152 spin_lock(&vma->vm_mm->page_table_lock);
2148 anon_vma_interval_tree_pre_update_vma(vma); 2153 anon_vma_interval_tree_pre_update_vma(vma);
2149 vma->vm_end = address; 2154 vma->vm_end = address;
2150 anon_vma_interval_tree_post_update_vma(vma); 2155 anon_vma_interval_tree_post_update_vma(vma);
2151 if (vma->vm_next) 2156 if (vma->vm_next)
2152 vma_gap_update(vma->vm_next); 2157 vma_gap_update(vma->vm_next);
2153 else 2158 else
2154 vma->vm_mm->highest_vm_end = address; 2159 vma->vm_mm->highest_vm_end = address;
2155 spin_unlock(&vma->vm_mm->page_table_lock); 2160 spin_unlock(&vma->vm_mm->page_table_lock);
2156 2161
2157 perf_event_mmap(vma); 2162 perf_event_mmap(vma);
2158 } 2163 }
2159 } 2164 }
2160 } 2165 }
2161 vma_unlock_anon_vma(vma); 2166 vma_unlock_anon_vma(vma);
2162 khugepaged_enter_vma_merge(vma); 2167 khugepaged_enter_vma_merge(vma);
2163 validate_mm(vma->vm_mm); 2168 validate_mm(vma->vm_mm);
2164 return error; 2169 return error;
2165 } 2170 }
2166 #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ 2171 #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
2167 2172
2168 /* 2173 /*
2169 * vma is the first one with address < vma->vm_start. Have to extend vma. 2174 * vma is the first one with address < vma->vm_start. Have to extend vma.
2170 */ 2175 */
2171 int expand_downwards(struct vm_area_struct *vma, 2176 int expand_downwards(struct vm_area_struct *vma,
2172 unsigned long address) 2177 unsigned long address)
2173 { 2178 {
2174 int error; 2179 int error;
2175 2180
2176 /* 2181 /*
2177 * We must make sure the anon_vma is allocated 2182 * We must make sure the anon_vma is allocated
2178 * so that the anon_vma locking is not a noop. 2183 * so that the anon_vma locking is not a noop.
2179 */ 2184 */
2180 if (unlikely(anon_vma_prepare(vma))) 2185 if (unlikely(anon_vma_prepare(vma)))
2181 return -ENOMEM; 2186 return -ENOMEM;
2182 2187
2183 address &= PAGE_MASK; 2188 address &= PAGE_MASK;
2184 error = security_mmap_addr(address); 2189 error = security_mmap_addr(address);
2185 if (error) 2190 if (error)
2186 return error; 2191 return error;
2187 2192
2188 vma_lock_anon_vma(vma); 2193 vma_lock_anon_vma(vma);
2189 2194
2190 /* 2195 /*
2191 * vma->vm_start/vm_end cannot change under us because the caller 2196 * vma->vm_start/vm_end cannot change under us because the caller
2192 * is required to hold the mmap_sem in read mode. We need the 2197 * is required to hold the mmap_sem in read mode. We need the
2193 * anon_vma lock to serialize against concurrent expand_stacks. 2198 * anon_vma lock to serialize against concurrent expand_stacks.
2194 */ 2199 */
2195 2200
2196 /* Somebody else might have raced and expanded it already */ 2201 /* Somebody else might have raced and expanded it already */
2197 if (address < vma->vm_start) { 2202 if (address < vma->vm_start) {
2198 unsigned long size, grow; 2203 unsigned long size, grow;
2199 2204
2200 size = vma->vm_end - address; 2205 size = vma->vm_end - address;
2201 grow = (vma->vm_start - address) >> PAGE_SHIFT; 2206 grow = (vma->vm_start - address) >> PAGE_SHIFT;
2202 2207
2203 error = -ENOMEM; 2208 error = -ENOMEM;
2204 if (grow <= vma->vm_pgoff) { 2209 if (grow <= vma->vm_pgoff) {
2205 error = acct_stack_growth(vma, size, grow); 2210 error = acct_stack_growth(vma, size, grow);
2206 if (!error) { 2211 if (!error) {
2207 /* 2212 /*
2208 * vma_gap_update() doesn't support concurrent 2213 * vma_gap_update() doesn't support concurrent
2209 * updates, but we only hold a shared mmap_sem 2214 * updates, but we only hold a shared mmap_sem
2210 * lock here, so we need to protect against 2215 * lock here, so we need to protect against
2211 * concurrent vma expansions. 2216 * concurrent vma expansions.
2212 * vma_lock_anon_vma() doesn't help here, as 2217 * vma_lock_anon_vma() doesn't help here, as
2213 * we don't guarantee that all growable vmas 2218 * we don't guarantee that all growable vmas
2214 * in a mm share the same root anon vma. 2219 * in a mm share the same root anon vma.
2215 * So, we reuse mm->page_table_lock to guard 2220 * So, we reuse mm->page_table_lock to guard
2216 * against concurrent vma expansions. 2221 * against concurrent vma expansions.
2217 */ 2222 */
2218 spin_lock(&vma->vm_mm->page_table_lock); 2223 spin_lock(&vma->vm_mm->page_table_lock);
2219 anon_vma_interval_tree_pre_update_vma(vma); 2224 anon_vma_interval_tree_pre_update_vma(vma);
2220 vma->vm_start = address; 2225 vma->vm_start = address;
2221 vma->vm_pgoff -= grow; 2226 vma->vm_pgoff -= grow;
2222 anon_vma_interval_tree_post_update_vma(vma); 2227 anon_vma_interval_tree_post_update_vma(vma);
2223 vma_gap_update(vma); 2228 vma_gap_update(vma);
2224 spin_unlock(&vma->vm_mm->page_table_lock); 2229 spin_unlock(&vma->vm_mm->page_table_lock);
2225 2230
2226 perf_event_mmap(vma); 2231 perf_event_mmap(vma);
2227 } 2232 }
2228 } 2233 }
2229 } 2234 }
2230 vma_unlock_anon_vma(vma); 2235 vma_unlock_anon_vma(vma);
2231 khugepaged_enter_vma_merge(vma); 2236 khugepaged_enter_vma_merge(vma);
2232 validate_mm(vma->vm_mm); 2237 validate_mm(vma->vm_mm);
2233 return error; 2238 return error;
2234 } 2239 }
2235 2240
2236 /* 2241 /*
2237 * Note how expand_stack() refuses to expand the stack all the way to 2242 * Note how expand_stack() refuses to expand the stack all the way to
2238 * abut the next virtual mapping, *unless* that mapping itself is also 2243 * abut the next virtual mapping, *unless* that mapping itself is also
2239 * a stack mapping. We want to leave room for a guard page, after all 2244 * a stack mapping. We want to leave room for a guard page, after all
2240 * (the guard page itself is not added here, that is done by the 2245 * (the guard page itself is not added here, that is done by the
2241 * actual page faulting logic) 2246 * actual page faulting logic)
2242 * 2247 *
2243 * This matches the behavior of the guard page logic (see mm/memory.c: 2248 * This matches the behavior of the guard page logic (see mm/memory.c:
2244 * check_stack_guard_page()), which only allows the guard page to be 2249 * check_stack_guard_page()), which only allows the guard page to be
2245 * removed under these circumstances. 2250 * removed under these circumstances.
2246 */ 2251 */
2247 #ifdef CONFIG_STACK_GROWSUP 2252 #ifdef CONFIG_STACK_GROWSUP
2248 int expand_stack(struct vm_area_struct *vma, unsigned long address) 2253 int expand_stack(struct vm_area_struct *vma, unsigned long address)
2249 { 2254 {
2250 struct vm_area_struct *next; 2255 struct vm_area_struct *next;
2251 2256
2252 address &= PAGE_MASK; 2257 address &= PAGE_MASK;
2253 next = vma->vm_next; 2258 next = vma->vm_next;
2254 if (next && next->vm_start == address + PAGE_SIZE) { 2259 if (next && next->vm_start == address + PAGE_SIZE) {
2255 if (!(next->vm_flags & VM_GROWSUP)) 2260 if (!(next->vm_flags & VM_GROWSUP))
2256 return -ENOMEM; 2261 return -ENOMEM;
2257 } 2262 }
2258 return expand_upwards(vma, address); 2263 return expand_upwards(vma, address);
2259 } 2264 }
2260 2265
2261 struct vm_area_struct * 2266 struct vm_area_struct *
2262 find_extend_vma(struct mm_struct *mm, unsigned long addr) 2267 find_extend_vma(struct mm_struct *mm, unsigned long addr)
2263 { 2268 {
2264 struct vm_area_struct *vma, *prev; 2269 struct vm_area_struct *vma, *prev;
2265 2270
2266 addr &= PAGE_MASK; 2271 addr &= PAGE_MASK;
2267 vma = find_vma_prev(mm, addr, &prev); 2272 vma = find_vma_prev(mm, addr, &prev);
2268 if (vma && (vma->vm_start <= addr)) 2273 if (vma && (vma->vm_start <= addr))
2269 return vma; 2274 return vma;
2270 if (!prev || expand_stack(prev, addr)) 2275 if (!prev || expand_stack(prev, addr))
2271 return NULL; 2276 return NULL;
2272 if (prev->vm_flags & VM_LOCKED) 2277 if (prev->vm_flags & VM_LOCKED)
2273 __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL); 2278 __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL);
2274 return prev; 2279 return prev;
2275 } 2280 }
2276 #else 2281 #else
2277 int expand_stack(struct vm_area_struct *vma, unsigned long address) 2282 int expand_stack(struct vm_area_struct *vma, unsigned long address)
2278 { 2283 {
2279 struct vm_area_struct *prev; 2284 struct vm_area_struct *prev;
2280 2285
2281 address &= PAGE_MASK; 2286 address &= PAGE_MASK;
2282 prev = vma->vm_prev; 2287 prev = vma->vm_prev;
2283 if (prev && prev->vm_end == address) { 2288 if (prev && prev->vm_end == address) {
2284 if (!(prev->vm_flags & VM_GROWSDOWN)) 2289 if (!(prev->vm_flags & VM_GROWSDOWN))
2285 return -ENOMEM; 2290 return -ENOMEM;
2286 } 2291 }
2287 return expand_downwards(vma, address); 2292 return expand_downwards(vma, address);
2288 } 2293 }
2289 2294
2290 struct vm_area_struct * 2295 struct vm_area_struct *
2291 find_extend_vma(struct mm_struct * mm, unsigned long addr) 2296 find_extend_vma(struct mm_struct * mm, unsigned long addr)
2292 { 2297 {
2293 struct vm_area_struct * vma; 2298 struct vm_area_struct * vma;
2294 unsigned long start; 2299 unsigned long start;
2295 2300
2296 addr &= PAGE_MASK; 2301 addr &= PAGE_MASK;
2297 vma = find_vma(mm,addr); 2302 vma = find_vma(mm,addr);
2298 if (!vma) 2303 if (!vma)
2299 return NULL; 2304 return NULL;
2300 if (vma->vm_start <= addr) 2305 if (vma->vm_start <= addr)
2301 return vma; 2306 return vma;
2302 if (!(vma->vm_flags & VM_GROWSDOWN)) 2307 if (!(vma->vm_flags & VM_GROWSDOWN))
2303 return NULL; 2308 return NULL;
2304 start = vma->vm_start; 2309 start = vma->vm_start;
2305 if (expand_stack(vma, addr)) 2310 if (expand_stack(vma, addr))
2306 return NULL; 2311 return NULL;
2307 if (vma->vm_flags & VM_LOCKED) 2312 if (vma->vm_flags & VM_LOCKED)
2308 __mlock_vma_pages_range(vma, addr, start, NULL); 2313 __mlock_vma_pages_range(vma, addr, start, NULL);
2309 return vma; 2314 return vma;
2310 } 2315 }
2311 #endif 2316 #endif
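The two find_extend_vma() flavours above are what the page-fault path uses when a faulting address falls just outside an existing stack vma: the vma is looked up, and if it (or its neighbour) is a growable stack, expand_stack() extends it via expand_upwards()/expand_downwards(), subject to the RLIMIT_STACK check in acct_stack_growth(). A minimal user-space sketch of the common VM_GROWSDOWN case (illustrative only; the recursion depth is an arbitrary choice kept well under the default 8MB stack limit):

/* stack_grow.c -- fault in new stack pages so the kernel extends the
 * stack vma downwards.  Build with: gcc -O0 -o stack_grow stack_grow.c */
#include <stdio.h>
#include <string.h>

static long recurse(int depth)
{
        char pad[4096];                 /* roughly one new stack page per frame */

        memset(pad, depth & 0xff, sizeof(pad));
        if (depth == 0)
                return pad[0];
        return pad[0] + recurse(depth - 1);
}

int main(void)
{
        /* ~1024 frames, ~4MB of stack: each first touch of a lower page
         * faults and goes through expand_stack()/expand_downwards(). */
        printf("sum = %ld\n", recurse(1024));
        return 0;
}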
2312 2317
2313 /* 2318 /*
2314 * Ok - we have the memory areas we should free on the vma list, 2319 * Ok - we have the memory areas we should free on the vma list,
2315 * so release them, and do the vma updates. 2320 * so release them, and do the vma updates.
2316 * 2321 *
2317 * Called with the mm semaphore held. 2322 * Called with the mm semaphore held.
2318 */ 2323 */
2319 static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) 2324 static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
2320 { 2325 {
2321 unsigned long nr_accounted = 0; 2326 unsigned long nr_accounted = 0;
2322 2327
2323 /* Update high watermark before we lower total_vm */ 2328 /* Update high watermark before we lower total_vm */
2324 update_hiwater_vm(mm); 2329 update_hiwater_vm(mm);
2325 do { 2330 do {
2326 long nrpages = vma_pages(vma); 2331 long nrpages = vma_pages(vma);
2327 2332
2328 if (vma->vm_flags & VM_ACCOUNT) 2333 if (vma->vm_flags & VM_ACCOUNT)
2329 nr_accounted += nrpages; 2334 nr_accounted += nrpages;
2330 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); 2335 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
2331 vma = remove_vma(vma); 2336 vma = remove_vma(vma);
2332 } while (vma); 2337 } while (vma);
2333 vm_unacct_memory(nr_accounted); 2338 vm_unacct_memory(nr_accounted);
2334 validate_mm(mm); 2339 validate_mm(mm);
2335 } 2340 }
2336 2341
2337 /* 2342 /*
2338 * Get rid of page table information in the indicated region. 2343 * Get rid of page table information in the indicated region.
2339 * 2344 *
2340 * Called with the mm semaphore held. 2345 * Called with the mm semaphore held.
2341 */ 2346 */
2342 static void unmap_region(struct mm_struct *mm, 2347 static void unmap_region(struct mm_struct *mm,
2343 struct vm_area_struct *vma, struct vm_area_struct *prev, 2348 struct vm_area_struct *vma, struct vm_area_struct *prev,
2344 unsigned long start, unsigned long end) 2349 unsigned long start, unsigned long end)
2345 { 2350 {
2346 struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; 2351 struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
2347 struct mmu_gather tlb; 2352 struct mmu_gather tlb;
2348 2353
2349 lru_add_drain(); 2354 lru_add_drain();
2350 tlb_gather_mmu(&tlb, mm, 0); 2355 tlb_gather_mmu(&tlb, mm, 0);
2351 update_hiwater_rss(mm); 2356 update_hiwater_rss(mm);
2352 unmap_vmas(&tlb, vma, start, end); 2357 unmap_vmas(&tlb, vma, start, end);
2353 free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, 2358 free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
2354 next ? next->vm_start : USER_PGTABLES_CEILING); 2359 next ? next->vm_start : USER_PGTABLES_CEILING);
2355 tlb_finish_mmu(&tlb, start, end); 2360 tlb_finish_mmu(&tlb, start, end);
2356 } 2361 }
2357 2362
2358 /* 2363 /*
2359 * Create a list of vma's touched by the unmap, removing them from the mm's 2364 * Create a list of vma's touched by the unmap, removing them from the mm's
2360 * vma list as we go.. 2365 * vma list as we go..
2361 */ 2366 */
2362 static void 2367 static void
2363 detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, 2368 detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
2364 struct vm_area_struct *prev, unsigned long end) 2369 struct vm_area_struct *prev, unsigned long end)
2365 { 2370 {
2366 struct vm_area_struct **insertion_point; 2371 struct vm_area_struct **insertion_point;
2367 struct vm_area_struct *tail_vma = NULL; 2372 struct vm_area_struct *tail_vma = NULL;
2368 unsigned long addr; 2373 unsigned long addr;
2369 2374
2370 insertion_point = (prev ? &prev->vm_next : &mm->mmap); 2375 insertion_point = (prev ? &prev->vm_next : &mm->mmap);
2371 vma->vm_prev = NULL; 2376 vma->vm_prev = NULL;
2372 do { 2377 do {
2373 vma_rb_erase(vma, &mm->mm_rb); 2378 vma_rb_erase(vma, &mm->mm_rb);
2374 mm->map_count--; 2379 mm->map_count--;
2375 tail_vma = vma; 2380 tail_vma = vma;
2376 vma = vma->vm_next; 2381 vma = vma->vm_next;
2377 } while (vma && vma->vm_start < end); 2382 } while (vma && vma->vm_start < end);
2378 *insertion_point = vma; 2383 *insertion_point = vma;
2379 if (vma) { 2384 if (vma) {
2380 vma->vm_prev = prev; 2385 vma->vm_prev = prev;
2381 vma_gap_update(vma); 2386 vma_gap_update(vma);
2382 } else 2387 } else
2383 mm->highest_vm_end = prev ? prev->vm_end : 0; 2388 mm->highest_vm_end = prev ? prev->vm_end : 0;
2384 tail_vma->vm_next = NULL; 2389 tail_vma->vm_next = NULL;
2385 if (mm->unmap_area == arch_unmap_area) 2390 if (mm->unmap_area == arch_unmap_area)
2386 addr = prev ? prev->vm_end : mm->mmap_base; 2391 addr = prev ? prev->vm_end : mm->mmap_base;
2387 else 2392 else
2388 addr = vma ? vma->vm_start : mm->mmap_base; 2393 addr = vma ? vma->vm_start : mm->mmap_base;
2389 mm->unmap_area(mm, addr); 2394 mm->unmap_area(mm, addr);
2390 mm->mmap_cache = NULL; /* Kill the cache. */ 2395 mm->mmap_cache = NULL; /* Kill the cache. */
2391 } 2396 }
2392 2397
2393 /* 2398 /*
2394 * __split_vma() bypasses sysctl_max_map_count checking. We use this on the 2399 * __split_vma() bypasses sysctl_max_map_count checking. We use this on the
2395 * munmap path where it doesn't make sense to fail. 2400 * munmap path where it doesn't make sense to fail.
2396 */ 2401 */
2397 static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, 2402 static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
2398 unsigned long addr, int new_below) 2403 unsigned long addr, int new_below)
2399 { 2404 {
2400 struct mempolicy *pol; 2405 struct mempolicy *pol;
2401 struct vm_area_struct *new; 2406 struct vm_area_struct *new;
2402 int err = -ENOMEM; 2407 int err = -ENOMEM;
2403 2408
2404 if (is_vm_hugetlb_page(vma) && (addr & 2409 if (is_vm_hugetlb_page(vma) && (addr &
2405 ~(huge_page_mask(hstate_vma(vma))))) 2410 ~(huge_page_mask(hstate_vma(vma)))))
2406 return -EINVAL; 2411 return -EINVAL;
2407 2412
2408 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 2413 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
2409 if (!new) 2414 if (!new)
2410 goto out_err; 2415 goto out_err;
2411 2416
2412 /* most fields are the same, copy all, and then fixup */ 2417 /* most fields are the same, copy all, and then fixup */
2413 *new = *vma; 2418 *new = *vma;
2414 2419
2415 INIT_LIST_HEAD(&new->anon_vma_chain); 2420 INIT_LIST_HEAD(&new->anon_vma_chain);
2416 2421
2417 if (new_below) 2422 if (new_below)
2418 new->vm_end = addr; 2423 new->vm_end = addr;
2419 else { 2424 else {
2420 new->vm_start = addr; 2425 new->vm_start = addr;
2421 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); 2426 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
2422 } 2427 }
2423 2428
2424 pol = mpol_dup(vma_policy(vma)); 2429 pol = mpol_dup(vma_policy(vma));
2425 if (IS_ERR(pol)) { 2430 if (IS_ERR(pol)) {
2426 err = PTR_ERR(pol); 2431 err = PTR_ERR(pol);
2427 goto out_free_vma; 2432 goto out_free_vma;
2428 } 2433 }
2429 vma_set_policy(new, pol); 2434 vma_set_policy(new, pol);
2430 2435
2431 if (anon_vma_clone(new, vma)) 2436 if (anon_vma_clone(new, vma))
2432 goto out_free_mpol; 2437 goto out_free_mpol;
2433 2438
2434 if (new->vm_file) 2439 if (new->vm_file)
2435 get_file(new->vm_file); 2440 get_file(new->vm_file);
2436 2441
2437 if (new->vm_ops && new->vm_ops->open) 2442 if (new->vm_ops && new->vm_ops->open)
2438 new->vm_ops->open(new); 2443 new->vm_ops->open(new);
2439 2444
2440 if (new_below) 2445 if (new_below)
2441 err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + 2446 err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
2442 ((addr - new->vm_start) >> PAGE_SHIFT), new); 2447 ((addr - new->vm_start) >> PAGE_SHIFT), new);
2443 else 2448 else
2444 err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); 2449 err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
2445 2450
2446 /* Success. */ 2451 /* Success. */
2447 if (!err) 2452 if (!err)
2448 return 0; 2453 return 0;
2449 2454
2450 /* Clean everything up if vma_adjust failed. */ 2455 /* Clean everything up if vma_adjust failed. */
2451 if (new->vm_ops && new->vm_ops->close) 2456 if (new->vm_ops && new->vm_ops->close)
2452 new->vm_ops->close(new); 2457 new->vm_ops->close(new);
2453 if (new->vm_file) 2458 if (new->vm_file)
2454 fput(new->vm_file); 2459 fput(new->vm_file);
2455 unlink_anon_vmas(new); 2460 unlink_anon_vmas(new);
2456 out_free_mpol: 2461 out_free_mpol:
2457 mpol_put(pol); 2462 mpol_put(pol);
2458 out_free_vma: 2463 out_free_vma:
2459 kmem_cache_free(vm_area_cachep, new); 2464 kmem_cache_free(vm_area_cachep, new);
2460 out_err: 2465 out_err:
2461 return err; 2466 return err;
2462 } 2467 }
2463 2468
2464 /* 2469 /*
2465 * Split a vma into two pieces at address 'addr', a new vma is allocated 2470 * Split a vma into two pieces at address 'addr', a new vma is allocated
2466 * either for the first part or the tail. 2471 * either for the first part or the tail.
2467 */ 2472 */
2468 int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, 2473 int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2469 unsigned long addr, int new_below) 2474 unsigned long addr, int new_below)
2470 { 2475 {
2471 if (mm->map_count >= sysctl_max_map_count) 2476 if (mm->map_count >= sysctl_max_map_count)
2472 return -ENOMEM; 2477 return -ENOMEM;
2473 2478
2474 return __split_vma(mm, vma, addr, new_below); 2479 return __split_vma(mm, vma, addr, new_below);
2475 } 2480 }
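split_vma() is what turns one vma into two when only part of a mapping changes attributes. A hedged user-space illustration (not taken from this commit): mprotect() on a single page in the middle of an anonymous mapping forces two splits, which shows up as three adjacent entries in /proc/self/maps.

/* split_demo.c -- make the kernel split a vma with a partial mprotect(). */
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, 4 * psz, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        /* Change protection of the second page only: the original vma is
         * split around [p + psz, p + 2*psz), leaving three vmas behind. */
        if (mprotect(p + psz, psz, PROT_READ)) {
                perror("mprotect");
                return 1;
        }
        printf("mapping starts at %p\n", (void *)p);
        system("cat /proc/self/maps");  /* look for three adjacent entries */
        return 0;
}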
2476 2481
2477 /* Munmap is split into 2 main parts -- this part which finds 2482 /* Munmap is split into 2 main parts -- this part which finds
2478 * what needs doing, and the areas themselves, which do the 2483 * what needs doing, and the areas themselves, which do the
2479 * work. This now handles partial unmappings. 2484 * work. This now handles partial unmappings.
2480 * Jeremy Fitzhardinge <jeremy@goop.org> 2485 * Jeremy Fitzhardinge <jeremy@goop.org>
2481 */ 2486 */
2482 int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) 2487 int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
2483 { 2488 {
2484 unsigned long end; 2489 unsigned long end;
2485 struct vm_area_struct *vma, *prev, *last; 2490 struct vm_area_struct *vma, *prev, *last;
2486 2491
2487 if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) 2492 if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
2488 return -EINVAL; 2493 return -EINVAL;
2489 2494
2490 if ((len = PAGE_ALIGN(len)) == 0) 2495 if ((len = PAGE_ALIGN(len)) == 0)
2491 return -EINVAL; 2496 return -EINVAL;
2492 2497
2493 /* Find the first overlapping VMA */ 2498 /* Find the first overlapping VMA */
2494 vma = find_vma(mm, start); 2499 vma = find_vma(mm, start);
2495 if (!vma) 2500 if (!vma)
2496 return 0; 2501 return 0;
2497 prev = vma->vm_prev; 2502 prev = vma->vm_prev;
2498 /* we have start < vma->vm_end */ 2503 /* we have start < vma->vm_end */
2499 2504
2500 /* if it doesn't overlap, we have nothing.. */ 2505 /* if it doesn't overlap, we have nothing.. */
2501 end = start + len; 2506 end = start + len;
2502 if (vma->vm_start >= end) 2507 if (vma->vm_start >= end)
2503 return 0; 2508 return 0;
2504 2509
2505 /* 2510 /*
2506 * If we need to split any vma, do it now to save pain later. 2511 * If we need to split any vma, do it now to save pain later.
2507 * 2512 *
2508 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially 2513 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
2509 * unmapped vm_area_struct will remain in use: so lower split_vma 2514 * unmapped vm_area_struct will remain in use: so lower split_vma
2510 * places tmp vma above, and higher split_vma places tmp vma below. 2515 * places tmp vma above, and higher split_vma places tmp vma below.
2511 */ 2516 */
2512 if (start > vma->vm_start) { 2517 if (start > vma->vm_start) {
2513 int error; 2518 int error;
2514 2519
2515 /* 2520 /*
2516 * Make sure that map_count on return from munmap() will 2521 * Make sure that map_count on return from munmap() will
2517 * not exceed its limit; but let map_count go just above 2522 * not exceed its limit; but let map_count go just above
2518 * its limit temporarily, to help free resources as expected. 2523 * its limit temporarily, to help free resources as expected.
2519 */ 2524 */
2520 if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) 2525 if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
2521 return -ENOMEM; 2526 return -ENOMEM;
2522 2527
2523 error = __split_vma(mm, vma, start, 0); 2528 error = __split_vma(mm, vma, start, 0);
2524 if (error) 2529 if (error)
2525 return error; 2530 return error;
2526 prev = vma; 2531 prev = vma;
2527 } 2532 }
2528 2533
2529 /* Does it split the last one? */ 2534 /* Does it split the last one? */
2530 last = find_vma(mm, end); 2535 last = find_vma(mm, end);
2531 if (last && end > last->vm_start) { 2536 if (last && end > last->vm_start) {
2532 int error = __split_vma(mm, last, end, 1); 2537 int error = __split_vma(mm, last, end, 1);
2533 if (error) 2538 if (error)
2534 return error; 2539 return error;
2535 } 2540 }
2536 vma = prev? prev->vm_next: mm->mmap; 2541 vma = prev? prev->vm_next: mm->mmap;
2537 2542
2538 /* 2543 /*
2539 * unlock any mlock()ed ranges before detaching vmas 2544 * unlock any mlock()ed ranges before detaching vmas
2540 */ 2545 */
2541 if (mm->locked_vm) { 2546 if (mm->locked_vm) {
2542 struct vm_area_struct *tmp = vma; 2547 struct vm_area_struct *tmp = vma;
2543 while (tmp && tmp->vm_start < end) { 2548 while (tmp && tmp->vm_start < end) {
2544 if (tmp->vm_flags & VM_LOCKED) { 2549 if (tmp->vm_flags & VM_LOCKED) {
2545 mm->locked_vm -= vma_pages(tmp); 2550 mm->locked_vm -= vma_pages(tmp);
2546 munlock_vma_pages_all(tmp); 2551 munlock_vma_pages_all(tmp);
2547 } 2552 }
2548 tmp = tmp->vm_next; 2553 tmp = tmp->vm_next;
2549 } 2554 }
2550 } 2555 }
2551 2556
2552 /* 2557 /*
2553 * Remove the vma's, and unmap the actual pages 2558 * Remove the vma's, and unmap the actual pages
2554 */ 2559 */
2555 detach_vmas_to_be_unmapped(mm, vma, prev, end); 2560 detach_vmas_to_be_unmapped(mm, vma, prev, end);
2556 unmap_region(mm, vma, prev, start, end); 2561 unmap_region(mm, vma, prev, start, end);
2557 2562
2558 /* Fix up all other VM information */ 2563 /* Fix up all other VM information */
2559 remove_vma_list(mm, vma); 2564 remove_vma_list(mm, vma);
2560 2565
2561 return 0; 2566 return 0;
2562 } 2567 }
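A user-space counterpart to the partial-unmap handling above (a sketch, not part of the patch): punching a one-page hole out of a larger anonymous mapping makes start > vma->vm_start and end < vma->vm_end, so do_munmap() splits at both edges and only the middle vma is detached and unmapped.

/* hole_punch.c -- partial munmap() exercising the split path in do_munmap(). */
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, 3 * psz, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        /* Unmap only the middle page. */
        if (munmap(p + psz, psz)) {
                perror("munmap");
                return 1;
        }
        p[0] = 1;               /* first page is still mapped */
        p[2 * psz] = 1;         /* last page is still mapped */
        puts("middle page unmapped, outer pages intact");
        return 0;
}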
2563 2568
2564 int vm_munmap(unsigned long start, size_t len) 2569 int vm_munmap(unsigned long start, size_t len)
2565 { 2570 {
2566 int ret; 2571 int ret;
2567 struct mm_struct *mm = current->mm; 2572 struct mm_struct *mm = current->mm;
2568 2573
2569 down_write(&mm->mmap_sem); 2574 down_write(&mm->mmap_sem);
2570 ret = do_munmap(mm, start, len); 2575 ret = do_munmap(mm, start, len);
2571 up_write(&mm->mmap_sem); 2576 up_write(&mm->mmap_sem);
2572 return ret; 2577 return ret;
2573 } 2578 }
2574 EXPORT_SYMBOL(vm_munmap); 2579 EXPORT_SYMBOL(vm_munmap);
2575 2580
2576 SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) 2581 SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
2577 { 2582 {
2578 profile_munmap(addr); 2583 profile_munmap(addr);
2579 return vm_munmap(addr, len); 2584 return vm_munmap(addr, len);
2580 } 2585 }
2581 2586
2582 static inline void verify_mm_writelocked(struct mm_struct *mm) 2587 static inline void verify_mm_writelocked(struct mm_struct *mm)
2583 { 2588 {
2584 #ifdef CONFIG_DEBUG_VM 2589 #ifdef CONFIG_DEBUG_VM
2585 if (unlikely(down_read_trylock(&mm->mmap_sem))) { 2590 if (unlikely(down_read_trylock(&mm->mmap_sem))) {
2586 WARN_ON(1); 2591 WARN_ON(1);
2587 up_read(&mm->mmap_sem); 2592 up_read(&mm->mmap_sem);
2588 } 2593 }
2589 #endif 2594 #endif
2590 } 2595 }
2591 2596
2592 /* 2597 /*
2593 * this is really a simplified "do_mmap". it only handles 2598 * this is really a simplified "do_mmap". it only handles
2594 * anonymous maps. eventually we may be able to do some 2599 * anonymous maps. eventually we may be able to do some
2595 * brk-specific accounting here. 2600 * brk-specific accounting here.
2596 */ 2601 */
2597 static unsigned long do_brk(unsigned long addr, unsigned long len) 2602 static unsigned long do_brk(unsigned long addr, unsigned long len)
2598 { 2603 {
2599 struct mm_struct * mm = current->mm; 2604 struct mm_struct * mm = current->mm;
2600 struct vm_area_struct * vma, * prev; 2605 struct vm_area_struct * vma, * prev;
2601 unsigned long flags; 2606 unsigned long flags;
2602 struct rb_node ** rb_link, * rb_parent; 2607 struct rb_node ** rb_link, * rb_parent;
2603 pgoff_t pgoff = addr >> PAGE_SHIFT; 2608 pgoff_t pgoff = addr >> PAGE_SHIFT;
2604 int error; 2609 int error;
2605 2610
2606 len = PAGE_ALIGN(len); 2611 len = PAGE_ALIGN(len);
2607 if (!len) 2612 if (!len)
2608 return addr; 2613 return addr;
2609 2614
2610 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; 2615 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
2611 2616
2612 error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); 2617 error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
2613 if (error & ~PAGE_MASK) 2618 if (error & ~PAGE_MASK)
2614 return error; 2619 return error;
2615 2620
2616 /* 2621 /*
2617 * mlock MCL_FUTURE? 2622 * mlock MCL_FUTURE?
2618 */ 2623 */
2619 if (mm->def_flags & VM_LOCKED) { 2624 if (mm->def_flags & VM_LOCKED) {
2620 unsigned long locked, lock_limit; 2625 unsigned long locked, lock_limit;
2621 locked = len >> PAGE_SHIFT; 2626 locked = len >> PAGE_SHIFT;
2622 locked += mm->locked_vm; 2627 locked += mm->locked_vm;
2623 lock_limit = rlimit(RLIMIT_MEMLOCK); 2628 lock_limit = rlimit(RLIMIT_MEMLOCK);
2624 lock_limit >>= PAGE_SHIFT; 2629 lock_limit >>= PAGE_SHIFT;
2625 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 2630 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
2626 return -EAGAIN; 2631 return -EAGAIN;
2627 } 2632 }
2628 2633
2629 /* 2634 /*
2630 * mm->mmap_sem is required to protect against another thread 2635 * mm->mmap_sem is required to protect against another thread
2631 * changing the mappings in case we sleep. 2636 * changing the mappings in case we sleep.
2632 */ 2637 */
2633 verify_mm_writelocked(mm); 2638 verify_mm_writelocked(mm);
2634 2639
2635 /* 2640 /*
2636 * Clear old maps. this also does some error checking for us 2641 * Clear old maps. this also does some error checking for us
2637 */ 2642 */
2638 munmap_back: 2643 munmap_back:
2639 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { 2644 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
2640 if (do_munmap(mm, addr, len)) 2645 if (do_munmap(mm, addr, len))
2641 return -ENOMEM; 2646 return -ENOMEM;
2642 goto munmap_back; 2647 goto munmap_back;
2643 } 2648 }
2644 2649
2645 /* Check against address space limits *after* clearing old maps... */ 2650 /* Check against address space limits *after* clearing old maps... */
2646 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) 2651 if (!may_expand_vm(mm, len >> PAGE_SHIFT))
2647 return -ENOMEM; 2652 return -ENOMEM;
2648 2653
2649 if (mm->map_count > sysctl_max_map_count) 2654 if (mm->map_count > sysctl_max_map_count)
2650 return -ENOMEM; 2655 return -ENOMEM;
2651 2656
2652 if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) 2657 if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
2653 return -ENOMEM; 2658 return -ENOMEM;
2654 2659
2655 /* Can we just expand an old private anonymous mapping? */ 2660 /* Can we just expand an old private anonymous mapping? */
2656 vma = vma_merge(mm, prev, addr, addr + len, flags, 2661 vma = vma_merge(mm, prev, addr, addr + len, flags,
2657 NULL, NULL, pgoff, NULL); 2662 NULL, NULL, pgoff, NULL);
2658 if (vma) 2663 if (vma)
2659 goto out; 2664 goto out;
2660 2665
2661 /* 2666 /*
2662 * create a vma struct for an anonymous mapping 2667 * create a vma struct for an anonymous mapping
2663 */ 2668 */
2664 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); 2669 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
2665 if (!vma) { 2670 if (!vma) {
2666 vm_unacct_memory(len >> PAGE_SHIFT); 2671 vm_unacct_memory(len >> PAGE_SHIFT);
2667 return -ENOMEM; 2672 return -ENOMEM;
2668 } 2673 }
2669 2674
2670 INIT_LIST_HEAD(&vma->anon_vma_chain); 2675 INIT_LIST_HEAD(&vma->anon_vma_chain);
2671 vma->vm_mm = mm; 2676 vma->vm_mm = mm;
2672 vma->vm_start = addr; 2677 vma->vm_start = addr;
2673 vma->vm_end = addr + len; 2678 vma->vm_end = addr + len;
2674 vma->vm_pgoff = pgoff; 2679 vma->vm_pgoff = pgoff;
2675 vma->vm_flags = flags; 2680 vma->vm_flags = flags;
2676 vma->vm_page_prot = vm_get_page_prot(flags); 2681 vma->vm_page_prot = vm_get_page_prot(flags);
2677 vma_link(mm, vma, prev, rb_link, rb_parent); 2682 vma_link(mm, vma, prev, rb_link, rb_parent);
2678 out: 2683 out:
2679 perf_event_mmap(vma); 2684 perf_event_mmap(vma);
2680 mm->total_vm += len >> PAGE_SHIFT; 2685 mm->total_vm += len >> PAGE_SHIFT;
2681 if (flags & VM_LOCKED) 2686 if (flags & VM_LOCKED)
2682 mm->locked_vm += (len >> PAGE_SHIFT); 2687 mm->locked_vm += (len >> PAGE_SHIFT);
2683 return addr; 2688 return addr;
2684 } 2689 }
2685 2690
2686 unsigned long vm_brk(unsigned long addr, unsigned long len) 2691 unsigned long vm_brk(unsigned long addr, unsigned long len)
2687 { 2692 {
2688 struct mm_struct *mm = current->mm; 2693 struct mm_struct *mm = current->mm;
2689 unsigned long ret; 2694 unsigned long ret;
2690 bool populate; 2695 bool populate;
2691 2696
2692 down_write(&mm->mmap_sem); 2697 down_write(&mm->mmap_sem);
2693 ret = do_brk(addr, len); 2698 ret = do_brk(addr, len);
2694 populate = ((mm->def_flags & VM_LOCKED) != 0); 2699 populate = ((mm->def_flags & VM_LOCKED) != 0);
2695 up_write(&mm->mmap_sem); 2700 up_write(&mm->mmap_sem);
2696 if (populate) 2701 if (populate)
2697 mm_populate(addr, len); 2702 mm_populate(addr, len);
2698 return ret; 2703 return ret;
2699 } 2704 }
2700 EXPORT_SYMBOL(vm_brk); 2705 EXPORT_SYMBOL(vm_brk);
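do_brk() is the anonymous-only fast path behind the brk() system call and, via vm_brk(), the binfmt loaders' bss-style segments. From user space the same mechanism is reachable through sbrk(); a small hedged example:

/* brk_demo.c -- grow the heap with sbrk(), which ends up in the brk()
 * syscall and ultimately in this do_brk() path. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        void *old = sbrk(0);                    /* current program break */

        if (sbrk(1 << 20) == (void *)-1) {      /* ask for 1MB more */
                perror("sbrk");
                return 1;
        }
        memset(old, 0, 1 << 20);                /* touch it; pages fault in lazily */
        printf("break moved from %p to %p\n", old, sbrk(0));
        return 0;
}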
2701 2706
2702 /* Release all mmaps. */ 2707 /* Release all mmaps. */
2703 void exit_mmap(struct mm_struct *mm) 2708 void exit_mmap(struct mm_struct *mm)
2704 { 2709 {
2705 struct mmu_gather tlb; 2710 struct mmu_gather tlb;
2706 struct vm_area_struct *vma; 2711 struct vm_area_struct *vma;
2707 unsigned long nr_accounted = 0; 2712 unsigned long nr_accounted = 0;
2708 2713
2709 /* mm's last user has gone, and it's about to be pulled down */ 2714 /* mm's last user has gone, and it's about to be pulled down */
2710 mmu_notifier_release(mm); 2715 mmu_notifier_release(mm);
2711 2716
2712 if (mm->locked_vm) { 2717 if (mm->locked_vm) {
2713 vma = mm->mmap; 2718 vma = mm->mmap;
2714 while (vma) { 2719 while (vma) {
2715 if (vma->vm_flags & VM_LOCKED) 2720 if (vma->vm_flags & VM_LOCKED)
2716 munlock_vma_pages_all(vma); 2721 munlock_vma_pages_all(vma);
2717 vma = vma->vm_next; 2722 vma = vma->vm_next;
2718 } 2723 }
2719 } 2724 }
2720 2725
2721 arch_exit_mmap(mm); 2726 arch_exit_mmap(mm);
2722 2727
2723 vma = mm->mmap; 2728 vma = mm->mmap;
2724 if (!vma) /* Can happen if dup_mmap() received an OOM */ 2729 if (!vma) /* Can happen if dup_mmap() received an OOM */
2725 return; 2730 return;
2726 2731
2727 lru_add_drain(); 2732 lru_add_drain();
2728 flush_cache_mm(mm); 2733 flush_cache_mm(mm);
2729 tlb_gather_mmu(&tlb, mm, 1); 2734 tlb_gather_mmu(&tlb, mm, 1);
2730 /* update_hiwater_rss(mm) here? but nobody should be looking */ 2735 /* update_hiwater_rss(mm) here? but nobody should be looking */
2731 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2736 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2732 unmap_vmas(&tlb, vma, 0, -1); 2737 unmap_vmas(&tlb, vma, 0, -1);
2733 2738
2734 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); 2739 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
2735 tlb_finish_mmu(&tlb, 0, -1); 2740 tlb_finish_mmu(&tlb, 0, -1);
2736 2741
2737 /* 2742 /*
2738 * Walk the list again, actually closing and freeing it, 2743 * Walk the list again, actually closing and freeing it,
2739 * with preemption enabled, without holding any MM locks. 2744 * with preemption enabled, without holding any MM locks.
2740 */ 2745 */
2741 while (vma) { 2746 while (vma) {
2742 if (vma->vm_flags & VM_ACCOUNT) 2747 if (vma->vm_flags & VM_ACCOUNT)
2743 nr_accounted += vma_pages(vma); 2748 nr_accounted += vma_pages(vma);
2744 vma = remove_vma(vma); 2749 vma = remove_vma(vma);
2745 } 2750 }
2746 vm_unacct_memory(nr_accounted); 2751 vm_unacct_memory(nr_accounted);
2747 2752
2748 WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); 2753 WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
2749 } 2754 }
2750 2755
2751 /* Insert vm structure into process list sorted by address 2756 /* Insert vm structure into process list sorted by address
2752 * and into the inode's i_mmap tree. If vm_file is non-NULL 2757 * and into the inode's i_mmap tree. If vm_file is non-NULL
2753 * then i_mmap_mutex is taken here. 2758 * then i_mmap_mutex is taken here.
2754 */ 2759 */
2755 int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) 2760 int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
2756 { 2761 {
2757 struct vm_area_struct *prev; 2762 struct vm_area_struct *prev;
2758 struct rb_node **rb_link, *rb_parent; 2763 struct rb_node **rb_link, *rb_parent;
2759 2764
2760 /* 2765 /*
2761 * The vm_pgoff of a purely anonymous vma should be irrelevant 2766 * The vm_pgoff of a purely anonymous vma should be irrelevant
2762 * until its first write fault, when page's anon_vma and index 2767 * until its first write fault, when page's anon_vma and index
2763 * are set. But now set the vm_pgoff it will almost certainly 2768 * are set. But now set the vm_pgoff it will almost certainly
2764 * end up with (unless mremap moves it elsewhere before that 2769 * end up with (unless mremap moves it elsewhere before that
2765 * first wfault), so /proc/pid/maps tells a consistent story. 2770 * first wfault), so /proc/pid/maps tells a consistent story.
2766 * 2771 *
2767 * By setting it to reflect the virtual start address of the 2772 * By setting it to reflect the virtual start address of the
2768 * vma, merges and splits can happen in a seamless way, just 2773 * vma, merges and splits can happen in a seamless way, just
2769 * using the existing file pgoff checks and manipulations. 2774 * using the existing file pgoff checks and manipulations.
2770 * Similarly in do_mmap_pgoff and in do_brk. 2775 * Similarly in do_mmap_pgoff and in do_brk.
2771 */ 2776 */
2772 if (!vma->vm_file) { 2777 if (!vma->vm_file) {
2773 BUG_ON(vma->anon_vma); 2778 BUG_ON(vma->anon_vma);
2774 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; 2779 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
2775 } 2780 }
2776 if (find_vma_links(mm, vma->vm_start, vma->vm_end, 2781 if (find_vma_links(mm, vma->vm_start, vma->vm_end,
2777 &prev, &rb_link, &rb_parent)) 2782 &prev, &rb_link, &rb_parent))
2778 return -ENOMEM; 2783 return -ENOMEM;
2779 if ((vma->vm_flags & VM_ACCOUNT) && 2784 if ((vma->vm_flags & VM_ACCOUNT) &&
2780 security_vm_enough_memory_mm(mm, vma_pages(vma))) 2785 security_vm_enough_memory_mm(mm, vma_pages(vma)))
2781 return -ENOMEM; 2786 return -ENOMEM;
2782 2787
2783 vma_link(mm, vma, prev, rb_link, rb_parent); 2788 vma_link(mm, vma, prev, rb_link, rb_parent);
2784 return 0; 2789 return 0;
2785 } 2790 }
2786 2791
2787 /* 2792 /*
2788 * Copy the vma structure to a new location in the same mm, 2793 * Copy the vma structure to a new location in the same mm,
2789 * prior to moving page table entries, to effect an mremap move. 2794 * prior to moving page table entries, to effect an mremap move.
2790 */ 2795 */
2791 struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, 2796 struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2792 unsigned long addr, unsigned long len, pgoff_t pgoff, 2797 unsigned long addr, unsigned long len, pgoff_t pgoff,
2793 bool *need_rmap_locks) 2798 bool *need_rmap_locks)
2794 { 2799 {
2795 struct vm_area_struct *vma = *vmap; 2800 struct vm_area_struct *vma = *vmap;
2796 unsigned long vma_start = vma->vm_start; 2801 unsigned long vma_start = vma->vm_start;
2797 struct mm_struct *mm = vma->vm_mm; 2802 struct mm_struct *mm = vma->vm_mm;
2798 struct vm_area_struct *new_vma, *prev; 2803 struct vm_area_struct *new_vma, *prev;
2799 struct rb_node **rb_link, *rb_parent; 2804 struct rb_node **rb_link, *rb_parent;
2800 struct mempolicy *pol; 2805 struct mempolicy *pol;
2801 bool faulted_in_anon_vma = true; 2806 bool faulted_in_anon_vma = true;
2802 2807
2803 /* 2808 /*
2804 * If anonymous vma has not yet been faulted, update new pgoff 2809 * If anonymous vma has not yet been faulted, update new pgoff
2805 * to match new location, to increase its chance of merging. 2810 * to match new location, to increase its chance of merging.
2806 */ 2811 */
2807 if (unlikely(!vma->vm_file && !vma->anon_vma)) { 2812 if (unlikely(!vma->vm_file && !vma->anon_vma)) {
2808 pgoff = addr >> PAGE_SHIFT; 2813 pgoff = addr >> PAGE_SHIFT;
2809 faulted_in_anon_vma = false; 2814 faulted_in_anon_vma = false;
2810 } 2815 }
2811 2816
2812 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) 2817 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
2813 return NULL; /* should never get here */ 2818 return NULL; /* should never get here */
2814 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, 2819 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
2815 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); 2820 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
2816 if (new_vma) { 2821 if (new_vma) {
2817 /* 2822 /*
2818 * Source vma may have been merged into new_vma 2823 * Source vma may have been merged into new_vma
2819 */ 2824 */
2820 if (unlikely(vma_start >= new_vma->vm_start && 2825 if (unlikely(vma_start >= new_vma->vm_start &&
2821 vma_start < new_vma->vm_end)) { 2826 vma_start < new_vma->vm_end)) {
2822 /* 2827 /*
2823 * The only way we can get a vma_merge with 2828 * The only way we can get a vma_merge with
2824 * self during an mremap is if the vma hasn't 2829 * self during an mremap is if the vma hasn't
2825 * been faulted in yet and we were allowed to 2830 * been faulted in yet and we were allowed to
2826 * reset the dst vma->vm_pgoff to the 2831 * reset the dst vma->vm_pgoff to the
2827 * destination address of the mremap to allow 2832 * destination address of the mremap to allow
2828 * the merge to happen. mremap must change the 2833 * the merge to happen. mremap must change the
2829 * vm_pgoff linearity between src and dst vmas 2834 * vm_pgoff linearity between src and dst vmas
2830 * (in turn preventing a vma_merge) to be 2835 * (in turn preventing a vma_merge) to be
2831 * safe. It is only safe to keep the vm_pgoff 2836 * safe. It is only safe to keep the vm_pgoff
2832 * linear if there are no pages mapped yet. 2837 * linear if there are no pages mapped yet.
2833 */ 2838 */
2834 VM_BUG_ON(faulted_in_anon_vma); 2839 VM_BUG_ON(faulted_in_anon_vma);
2835 *vmap = vma = new_vma; 2840 *vmap = vma = new_vma;
2836 } 2841 }
2837 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); 2842 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
2838 } else { 2843 } else {
2839 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 2844 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
2840 if (new_vma) { 2845 if (new_vma) {
2841 *new_vma = *vma; 2846 *new_vma = *vma;
2842 new_vma->vm_start = addr; 2847 new_vma->vm_start = addr;
2843 new_vma->vm_end = addr + len; 2848 new_vma->vm_end = addr + len;
2844 new_vma->vm_pgoff = pgoff; 2849 new_vma->vm_pgoff = pgoff;
2845 pol = mpol_dup(vma_policy(vma)); 2850 pol = mpol_dup(vma_policy(vma));
2846 if (IS_ERR(pol)) 2851 if (IS_ERR(pol))
2847 goto out_free_vma; 2852 goto out_free_vma;
2848 vma_set_policy(new_vma, pol); 2853 vma_set_policy(new_vma, pol);
2849 INIT_LIST_HEAD(&new_vma->anon_vma_chain); 2854 INIT_LIST_HEAD(&new_vma->anon_vma_chain);
2850 if (anon_vma_clone(new_vma, vma)) 2855 if (anon_vma_clone(new_vma, vma))
2851 goto out_free_mempol; 2856 goto out_free_mempol;
2852 if (new_vma->vm_file) 2857 if (new_vma->vm_file)
2853 get_file(new_vma->vm_file); 2858 get_file(new_vma->vm_file);
2854 if (new_vma->vm_ops && new_vma->vm_ops->open) 2859 if (new_vma->vm_ops && new_vma->vm_ops->open)
2855 new_vma->vm_ops->open(new_vma); 2860 new_vma->vm_ops->open(new_vma);
2856 vma_link(mm, new_vma, prev, rb_link, rb_parent); 2861 vma_link(mm, new_vma, prev, rb_link, rb_parent);
2857 *need_rmap_locks = false; 2862 *need_rmap_locks = false;
2858 } 2863 }
2859 } 2864 }
2860 return new_vma; 2865 return new_vma;
2861 2866
2862 out_free_mempol: 2867 out_free_mempol:
2863 mpol_put(pol); 2868 mpol_put(pol);
2864 out_free_vma: 2869 out_free_vma:
2865 kmem_cache_free(vm_area_cachep, new_vma); 2870 kmem_cache_free(vm_area_cachep, new_vma);
2866 return NULL; 2871 return NULL;
2867 } 2872 }
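copy_vma() is reached from mremap()'s move_vma() path when a mapping has to change address. A hedged way to trigger it from user space (MREMAP_FIXED always forces a move rather than an in-place resize):

/* mremap_move.c -- force an mremap() move so the kernel copies the vma. */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        char *a = mmap(NULL, psz, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        char *target = mmap(NULL, psz, PROT_NONE,
                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (a == MAP_FAILED || target == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        a[0] = 42;
        /* MREMAP_FIXED replaces whatever is at 'target' and always moves,
         * so the kernel goes through copy_vma() plus the page-table move. */
        char *moved = mremap(a, psz, psz, MREMAP_MAYMOVE | MREMAP_FIXED, target);
        if (moved == MAP_FAILED) {
                perror("mremap");
                return 1;
        }
        printf("moved %p -> %p, byte is %d\n", (void *)a, (void *)moved, moved[0]);
        return 0;
}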
2868 2873
2869 /* 2874 /*
2870 * Return true if the calling process may expand its vm space by the passed 2875 * Return true if the calling process may expand its vm space by the passed
2871 * number of pages 2876 * number of pages
2872 */ 2877 */
2873 int may_expand_vm(struct mm_struct *mm, unsigned long npages) 2878 int may_expand_vm(struct mm_struct *mm, unsigned long npages)
2874 { 2879 {
2875 unsigned long cur = mm->total_vm; /* pages */ 2880 unsigned long cur = mm->total_vm; /* pages */
2876 unsigned long lim; 2881 unsigned long lim;
2877 2882
2878 lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT; 2883 lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT;
2879 2884
2880 if (cur + npages > lim) 2885 if (cur + npages > lim)
2881 return 0; 2886 return 0;
2882 return 1; 2887 return 1;
2883 } 2888 }
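may_expand_vm() is the RLIMIT_AS gate: the request is refused once total_vm plus the new pages would exceed the address-space limit, converted to pages. A quick hedged check from user space is to shrink RLIMIT_AS and watch a large mmap() come back with ENOMEM:

/* as_limit.c -- make may_expand_vm() refuse a mapping via RLIMIT_AS. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/resource.h>

int main(void)
{
        struct rlimit rl = { 64 << 20, 64 << 20 };      /* 64MB of address space */

        if (setrlimit(RLIMIT_AS, &rl)) {
                perror("setrlimit");
                return 1;
        }
        void *p = mmap(NULL, 256 << 20, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                printf("mmap failed as expected: %s\n", strerror(errno));
        else
                puts("mmap unexpectedly succeeded");
        return 0;
}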
2884 2889
2885 2890
2886 static int special_mapping_fault(struct vm_area_struct *vma, 2891 static int special_mapping_fault(struct vm_area_struct *vma,
2887 struct vm_fault *vmf) 2892 struct vm_fault *vmf)
2888 { 2893 {
2889 pgoff_t pgoff; 2894 pgoff_t pgoff;
2890 struct page **pages; 2895 struct page **pages;
2891 2896
2892 /* 2897 /*
2893 * special mappings have no vm_file, and in that case, the mm 2898 * special mappings have no vm_file, and in that case, the mm
2894 * uses vm_pgoff internally. So we have to subtract it from here. 2899 * uses vm_pgoff internally. So we have to subtract it from here.
2895 * We are allowed to do this because we are the mm; do not copy 2900 * We are allowed to do this because we are the mm; do not copy
2896 * this code into drivers! 2901 * this code into drivers!
2897 */ 2902 */
2898 pgoff = vmf->pgoff - vma->vm_pgoff; 2903 pgoff = vmf->pgoff - vma->vm_pgoff;
2899 2904
2900 for (pages = vma->vm_private_data; pgoff && *pages; ++pages) 2905 for (pages = vma->vm_private_data; pgoff && *pages; ++pages)
2901 pgoff--; 2906 pgoff--;
2902 2907
2903 if (*pages) { 2908 if (*pages) {
2904 struct page *page = *pages; 2909 struct page *page = *pages;
2905 get_page(page); 2910 get_page(page);
2906 vmf->page = page; 2911 vmf->page = page;
2907 return 0; 2912 return 0;
2908 } 2913 }
2909 2914
2910 return VM_FAULT_SIGBUS; 2915 return VM_FAULT_SIGBUS;
2911 } 2916 }
2912 2917
2913 /* 2918 /*
2914 * Having a close hook prevents vma merging regardless of flags. 2919 * Having a close hook prevents vma merging regardless of flags.
2915 */ 2920 */
2916 static void special_mapping_close(struct vm_area_struct *vma) 2921 static void special_mapping_close(struct vm_area_struct *vma)
2917 { 2922 {
2918 } 2923 }
2919 2924
2920 static const struct vm_operations_struct special_mapping_vmops = { 2925 static const struct vm_operations_struct special_mapping_vmops = {
2921 .close = special_mapping_close, 2926 .close = special_mapping_close,
2922 .fault = special_mapping_fault, 2927 .fault = special_mapping_fault,
2923 }; 2928 };
2924 2929
2925 /* 2930 /*
2926 * Called with mm->mmap_sem held for writing. 2931 * Called with mm->mmap_sem held for writing.
2927 * Insert a new vma covering the given region, with the given flags. 2932 * Insert a new vma covering the given region, with the given flags.
2928 * Its pages are supplied by the given array of struct page *. 2933 * Its pages are supplied by the given array of struct page *.
2929 * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. 2934 * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
2930 * The region past the last page supplied will always produce SIGBUS. 2935 * The region past the last page supplied will always produce SIGBUS.
2931 * The array pointer and the pages it points to are assumed to stay alive 2936 * The array pointer and the pages it points to are assumed to stay alive
2932 * for as long as this mapping might exist. 2937 * for as long as this mapping might exist.
2933 */ 2938 */
2934 int install_special_mapping(struct mm_struct *mm, 2939 int install_special_mapping(struct mm_struct *mm,
2935 unsigned long addr, unsigned long len, 2940 unsigned long addr, unsigned long len,
2936 unsigned long vm_flags, struct page **pages) 2941 unsigned long vm_flags, struct page **pages)
2937 { 2942 {
2938 int ret; 2943 int ret;
2939 struct vm_area_struct *vma; 2944 struct vm_area_struct *vma;
2940 2945
2941 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); 2946 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
2942 if (unlikely(vma == NULL)) 2947 if (unlikely(vma == NULL))
2943 return -ENOMEM; 2948 return -ENOMEM;
2944 2949
2945 INIT_LIST_HEAD(&vma->anon_vma_chain); 2950 INIT_LIST_HEAD(&vma->anon_vma_chain);
2946 vma->vm_mm = mm; 2951 vma->vm_mm = mm;
2947 vma->vm_start = addr; 2952 vma->vm_start = addr;
2948 vma->vm_end = addr + len; 2953 vma->vm_end = addr + len;
2949 2954
2950 vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND; 2955 vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND;
2951 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 2956 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
2952 2957
2953 vma->vm_ops = &special_mapping_vmops; 2958 vma->vm_ops = &special_mapping_vmops;
2954 vma->vm_private_data = pages; 2959 vma->vm_private_data = pages;
2955 2960
2956 ret = insert_vm_struct(mm, vma); 2961 ret = insert_vm_struct(mm, vma);
2957 if (ret) 2962 if (ret)
2958 goto out; 2963 goto out;
2959 2964
2960 mm->total_vm += len >> PAGE_SHIFT; 2965 mm->total_vm += len >> PAGE_SHIFT;
2961 2966
2962 perf_event_mmap(vma); 2967 perf_event_mmap(vma);
2963 2968
2964 return 0; 2969 return 0;
2965 2970
2966 out: 2971 out:
2967 kmem_cache_free(vm_area_cachep, vma); 2972 kmem_cache_free(vm_area_cachep, vma);
2968 return ret; 2973 return ret;
2969 } 2974 }
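install_special_mapping() is how architecture code drops a vDSO- or trampoline-style page into a process image: it builds a vma by hand, wires up special_mapping_vmops, and relies on the caller holding mmap_sem for writing. The sketch below shows only the calling convention; the names my_vdso_pages and my_arch_setup_additional_pages are invented for the example, and the flag set mirrors a typical read/execute vDSO.

/* Hypothetical caller of install_special_mapping() (names invented). */
static struct page *my_vdso_pages[3];   /* two pages + NULL terminator,
                                           filled in by arch init code */

static int my_arch_setup_additional_pages(unsigned long addr)
{
        struct mm_struct *mm = current->mm;
        int ret;

        down_write(&mm->mmap_sem);
        ret = install_special_mapping(mm, addr, 2 * PAGE_SIZE,
                                      VM_READ | VM_EXEC |
                                      VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC,
                                      my_vdso_pages);
        up_write(&mm->mmap_sem);
        return ret;
}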
2970 2975
2971 static DEFINE_MUTEX(mm_all_locks_mutex); 2976 static DEFINE_MUTEX(mm_all_locks_mutex);
2972 2977
2973 static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) 2978 static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2974 { 2979 {
2975 if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { 2980 if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
2976 /* 2981 /*
2977 * The LSB of head.next can't change from under us 2982 * The LSB of head.next can't change from under us
2978 * because we hold the mm_all_locks_mutex. 2983 * because we hold the mm_all_locks_mutex.
2979 */ 2984 */
2980 down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem); 2985 down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem);
2981 /* 2986 /*
2982 * We can safely modify head.next after taking the 2987 * We can safely modify head.next after taking the
2983 * anon_vma->root->rwsem. If some other vma in this mm shares 2988 * anon_vma->root->rwsem. If some other vma in this mm shares
2984 * the same anon_vma we won't take it again. 2989 * the same anon_vma we won't take it again.
2985 * 2990 *
2986 * No need of atomic instructions here, head.next 2991 * No need of atomic instructions here, head.next
2987 * can't change from under us thanks to the 2992 * can't change from under us thanks to the
2988 * anon_vma->root->rwsem. 2993 * anon_vma->root->rwsem.
2989 */ 2994 */
2990 if (__test_and_set_bit(0, (unsigned long *) 2995 if (__test_and_set_bit(0, (unsigned long *)
2991 &anon_vma->root->rb_root.rb_node)) 2996 &anon_vma->root->rb_root.rb_node))
2992 BUG(); 2997 BUG();
2993 } 2998 }
2994 } 2999 }
2995 3000
2996 static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) 3001 static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2997 { 3002 {
2998 if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { 3003 if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
2999 /* 3004 /*
3000 * AS_MM_ALL_LOCKS can't change from under us because 3005 * AS_MM_ALL_LOCKS can't change from under us because
3001 * we hold the mm_all_locks_mutex. 3006 * we hold the mm_all_locks_mutex.
3002 * 3007 *
3003 * Operations on ->flags have to be atomic because 3008 * Operations on ->flags have to be atomic because
3004 * even if AS_MM_ALL_LOCKS is stable thanks to the 3009 * even if AS_MM_ALL_LOCKS is stable thanks to the
3005 * mm_all_locks_mutex, there may be other cpus 3010 * mm_all_locks_mutex, there may be other cpus
3006 * changing other bitflags in parallel to us. 3011 * changing other bitflags in parallel to us.
3007 */ 3012 */
3008 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) 3013 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
3009 BUG(); 3014 BUG();
3010 mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem); 3015 mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);
3011 } 3016 }
3012 } 3017 }
3013 3018
3014 /* 3019 /*
3015 * This operation locks against the VM for all pte/vma/mm related 3020 * This operation locks against the VM for all pte/vma/mm related
3016 * operations that could ever happen on a certain mm. This includes 3021 * operations that could ever happen on a certain mm. This includes
3017 * vmtruncate, try_to_unmap, and all page faults. 3022 * vmtruncate, try_to_unmap, and all page faults.
3018 * 3023 *
3019 * The caller must take the mmap_sem in write mode before calling 3024 * The caller must take the mmap_sem in write mode before calling
3020 * mm_take_all_locks(). The caller isn't allowed to release the 3025 * mm_take_all_locks(). The caller isn't allowed to release the
3021 * mmap_sem until mm_drop_all_locks() returns. 3026 * mmap_sem until mm_drop_all_locks() returns.
3022 * 3027 *
3023 * mmap_sem in write mode is required in order to block all operations 3028 * mmap_sem in write mode is required in order to block all operations
3024 * that could modify pagetables and free pages without need of 3029 * that could modify pagetables and free pages without need of
3025 * altering the vma layout (for example populate_range() with 3030 * altering the vma layout (for example populate_range() with
3026 * nonlinear vmas). It's also needed in write mode to prevent new 3031 * nonlinear vmas). It's also needed in write mode to prevent new
3027 * anon_vmas from being associated with existing vmas. 3032 * anon_vmas from being associated with existing vmas.
3028 * 3033 *
3029 * A single task can't take more than one mm_take_all_locks() in a row 3034 * A single task can't take more than one mm_take_all_locks() in a row
3030 * or it would deadlock. 3035 * or it would deadlock.
3031 * 3036 *
3032 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in 3037 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
3033 * mapping->flags avoid taking the same lock twice if more than one 3038 * mapping->flags avoid taking the same lock twice if more than one
3034 * vma in this mm is backed by the same anon_vma or address_space. 3039 * vma in this mm is backed by the same anon_vma or address_space.
3035 * 3040 *
3036 * We can take all the locks in random order because the VM code 3041 * We can take all the locks in random order because the VM code
3037 * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never 3042 * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never
3038 * takes more than one of them in a row. Secondly we're protected 3043 * takes more than one of them in a row. Secondly we're protected
3039 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. 3044 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
3040 * 3045 *
3041 * mm_take_all_locks() and mm_drop_all_locks are expensive operations 3046 * mm_take_all_locks() and mm_drop_all_locks are expensive operations
3042 * that may have to take thousands of locks. 3047 * that may have to take thousands of locks.
3043 * 3048 *
3044 * mm_take_all_locks() can fail if it's interrupted by signals. 3049 * mm_take_all_locks() can fail if it's interrupted by signals.
3045 */ 3050 */
3046 int mm_take_all_locks(struct mm_struct *mm) 3051 int mm_take_all_locks(struct mm_struct *mm)
3047 { 3052 {
3048 struct vm_area_struct *vma; 3053 struct vm_area_struct *vma;
3049 struct anon_vma_chain *avc; 3054 struct anon_vma_chain *avc;
3050 3055
3051 BUG_ON(down_read_trylock(&mm->mmap_sem)); 3056 BUG_ON(down_read_trylock(&mm->mmap_sem));
3052 3057
3053 mutex_lock(&mm_all_locks_mutex); 3058 mutex_lock(&mm_all_locks_mutex);
3054 3059
3055 for (vma = mm->mmap; vma; vma = vma->vm_next) { 3060 for (vma = mm->mmap; vma; vma = vma->vm_next) {
3056 if (signal_pending(current)) 3061 if (signal_pending(current))
3057 goto out_unlock; 3062 goto out_unlock;
3058 if (vma->vm_file && vma->vm_file->f_mapping) 3063 if (vma->vm_file && vma->vm_file->f_mapping)
3059 vm_lock_mapping(mm, vma->vm_file->f_mapping); 3064 vm_lock_mapping(mm, vma->vm_file->f_mapping);
3060 } 3065 }
3061 3066
3062 for (vma = mm->mmap; vma; vma = vma->vm_next) { 3067 for (vma = mm->mmap; vma; vma = vma->vm_next) {
3063 if (signal_pending(current)) 3068 if (signal_pending(current))
3064 goto out_unlock; 3069 goto out_unlock;
3065 if (vma->anon_vma) 3070 if (vma->anon_vma)
3066 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 3071 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
3067 vm_lock_anon_vma(mm, avc->anon_vma); 3072 vm_lock_anon_vma(mm, avc->anon_vma);
3068 } 3073 }
3069 3074
3070 return 0; 3075 return 0;
3071 3076
3072 out_unlock: 3077 out_unlock:
3073 mm_drop_all_locks(mm); 3078 mm_drop_all_locks(mm);
3074 return -EINTR; 3079 return -EINTR;
3075 } 3080 }
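The comment block above spells out the contract; a condensed sketch of the expected calling sequence (not a real in-tree caller, which at this point is essentially mmu_notifier registration):

/* Sketch of the documented mm_take_all_locks() calling convention. */
static int freeze_mm_for_a_while(struct mm_struct *mm)
{
        int ret;

        down_write(&mm->mmap_sem);
        ret = mm_take_all_locks(mm);    /* may return -EINTR on signals */
        if (ret)
                goto out;

        /* ... every pte/vma/mm operation on this mm is now excluded ... */

        mm_drop_all_locks(mm);
out:
        up_write(&mm->mmap_sem);        /* only after mm_drop_all_locks() */
        return ret;
}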
3076 3081
3077 static void vm_unlock_anon_vma(struct anon_vma *anon_vma) 3082 static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
3078 { 3083 {
3079 if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { 3084 if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
3080 /* 3085 /*
3081 * The LSB of head.next can't change to 0 from under 3086 * The LSB of head.next can't change to 0 from under
3082 * us because we hold the mm_all_locks_mutex. 3087 * us because we hold the mm_all_locks_mutex.
3083 * 3088 *
3084 * We must however clear the bitflag before unlocking 3089 * We must however clear the bitflag before unlocking
3085 * the vma so the users using the anon_vma->rb_root will 3090 * the vma so the users using the anon_vma->rb_root will
3086 * never see our bitflag. 3091 * never see our bitflag.
3087 * 3092 *
3088 * No need of atomic instructions here, head.next 3093 * No need of atomic instructions here, head.next
3089 * can't change from under us until we release the 3094 * can't change from under us until we release the
3090 * anon_vma->root->rwsem. 3095 * anon_vma->root->rwsem.
3091 */ 3096 */
3092 if (!__test_and_clear_bit(0, (unsigned long *) 3097 if (!__test_and_clear_bit(0, (unsigned long *)
3093 &anon_vma->root->rb_root.rb_node)) 3098 &anon_vma->root->rb_root.rb_node))
3094 BUG(); 3099 BUG();
3095 anon_vma_unlock_write(anon_vma); 3100 anon_vma_unlock_write(anon_vma);
3096 } 3101 }
3097 } 3102 }
3098 3103
3099 static void vm_unlock_mapping(struct address_space *mapping) 3104 static void vm_unlock_mapping(struct address_space *mapping)
3100 { 3105 {
3101 if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { 3106 if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
3102 /* 3107 /*
3103 * AS_MM_ALL_LOCKS can't change to 0 from under us 3108 * AS_MM_ALL_LOCKS can't change to 0 from under us
3104 * because we hold the mm_all_locks_mutex. 3109 * because we hold the mm_all_locks_mutex.
3105 */ 3110 */
3106 mutex_unlock(&mapping->i_mmap_mutex); 3111 mutex_unlock(&mapping->i_mmap_mutex);
3107 if (!test_and_clear_bit(AS_MM_ALL_LOCKS, 3112 if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
3108 &mapping->flags)) 3113 &mapping->flags))
3109 BUG(); 3114 BUG();
3110 } 3115 }
3111 } 3116 }
3112 3117
3113 /* 3118 /*
3114 * The mmap_sem cannot be released by the caller until 3119 * The mmap_sem cannot be released by the caller until
3115 * mm_drop_all_locks() returns. 3120 * mm_drop_all_locks() returns.
3116 */ 3121 */
3117 void mm_drop_all_locks(struct mm_struct *mm) 3122 void mm_drop_all_locks(struct mm_struct *mm)
3118 { 3123 {
3119 struct vm_area_struct *vma; 3124 struct vm_area_struct *vma;
3120 struct anon_vma_chain *avc; 3125 struct anon_vma_chain *avc;
3121 3126
3122 BUG_ON(down_read_trylock(&mm->mmap_sem)); 3127 BUG_ON(down_read_trylock(&mm->mmap_sem));
3123 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); 3128 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
3124 3129
3125 for (vma = mm->mmap; vma; vma = vma->vm_next) { 3130 for (vma = mm->mmap; vma; vma = vma->vm_next) {
3126 if (vma->anon_vma) 3131 if (vma->anon_vma)
3127 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 3132 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
3128 vm_unlock_anon_vma(avc->anon_vma); 3133 vm_unlock_anon_vma(avc->anon_vma);
3129 if (vma->vm_file && vma->vm_file->f_mapping) 3134 if (vma->vm_file && vma->vm_file->f_mapping)
3130 vm_unlock_mapping(vma->vm_file->f_mapping); 3135 vm_unlock_mapping(vma->vm_file->f_mapping);
3131 } 3136 }
3132 3137
3133 mutex_unlock(&mm_all_locks_mutex); 3138 mutex_unlock(&mm_all_locks_mutex);
3134 } 3139 }
3135 3140
3136 /* 3141 /*
3137 * initialise the vm_committed_as percpu counter 3142 * initialise the vm_committed_as percpu counter
3138 */ 3143 */
3139 void __init mmap_init(void) 3144 void __init mmap_init(void)
3140 { 3145 {
3141 int ret; 3146 int ret;
3142 3147
3143 ret = percpu_counter_init(&vm_committed_as, 0); 3148 ret = percpu_counter_init(&vm_committed_as, 0);
3144 VM_BUG_ON(ret); 3149 VM_BUG_ON(ret);
3145 } 3150 }
3146 3151
3147 /* 3152 /*
3148 * Initialise sysctl_user_reserve_kbytes. 3153 * Initialise sysctl_user_reserve_kbytes.
3149 * 3154 *
3150 * This is intended to prevent a user from starting a single memory hogging 3155 * This is intended to prevent a user from starting a single memory hogging
3151 * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER 3156 * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
3152 * mode. 3157 * mode.
3153 * 3158 *
3154 * The default value is min(3% of free memory, 128MB) 3159 * The default value is min(3% of free memory, 128MB)
3155 * 128MB is enough to recover with sshd/login, bash, and top/kill. 3160 * 128MB is enough to recover with sshd/login, bash, and top/kill.
3156 */ 3161 */
3157 static int init_user_reserve(void) 3162 static int init_user_reserve(void)
3158 { 3163 {
3159 unsigned long free_kbytes; 3164 unsigned long free_kbytes;
3160 3165
3161 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); 3166 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3162 3167
3163 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); 3168 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
3164 return 0; 3169 return 0;
3165 } 3170 }
3166 module_init(init_user_reserve) 3171 module_init(init_user_reserve)
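Spelling the default out: free_kbytes / 32 is about 3% (3.125%) of free memory and 1UL << 17 kbytes is 131072kB = 128MB, so sysctl_user_reserve_kbytes starts at min(~3% of free, 128MB); init_admin_reserve() below does the same with 1UL << 13 = 8192kB = 8MB. Both values are runtime-tunable sysctls; a small check program (the /proc paths are assumed from the variable names):

/* reserve_peek.c -- print the reserves these initialisers set up. */
#include <stdio.h>

static void show(const char *path)
{
        char buf[64];
        FILE *f = fopen(path, "r");

        if (!f) {
                perror(path);
                return;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("%s = %s", path, buf);
        fclose(f);
}

int main(void)
{
        show("/proc/sys/vm/user_reserve_kbytes");       /* min(3% free, 128MB) */
        show("/proc/sys/vm/admin_reserve_kbytes");      /* min(3% free, 8MB) */
        return 0;
}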
3167 3172
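A quick worked example of the sizing above (illustrative numbers only): free_kbytes / 32 is roughly 3% of free memory, and 1UL << 17 kB is 131072 kB = 128 MB, so the cap takes over once more than about 4 GB is free:

	/*
	 * free memory   free_kbytes    free_kbytes / 32   resulting reserve
	 *    1 GB         1048576        32768 kB            32 MB
	 *    4 GB         4194304       131072 kB           128 MB
	 *   16 GB        16777216       524288 kB           128 MB (capped)
	 */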
3168 /* 3173 /*
3169 * Initialise sysctl_admin_reserve_kbytes. 3174 * Initialise sysctl_admin_reserve_kbytes.
3170 * 3175 *
3171 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin 3176 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
3172 * to log in and kill a memory hogging process. 3177 * to log in and kill a memory hogging process.
3173 * 3178 *
3174 * Systems with more than 256MB will reserve 8MB, enough to recover 3179 * Systems with more than 256MB will reserve 8MB, enough to recover
3175 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will 3180 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
3176 * only reserve 3% of free pages by default. 3181 * only reserve 3% of free pages by default.
3177 */ 3182 */
3178 static int init_admin_reserve(void) 3183 static int init_admin_reserve(void)
3179 { 3184 {
3180 unsigned long free_kbytes; 3185 unsigned long free_kbytes;
3181 3186
3182 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); 3187 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3183 3188
3184 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); 3189 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
3185 return 0; 3190 return 0;
3186 } 3191 }
3187 module_init(init_admin_reserve) 3192 module_init(init_admin_reserve)
3188 3193
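The same arithmetic applies to the admin reserve (again illustrative numbers): 1UL << 13 kB is 8192 kB = 8 MB, so the 256 MB figure in the comment is exactly where free_kbytes / 32 reaches the cap:

	/*
	 * free memory   free_kbytes   free_kbytes / 32   resulting reserve
	 *   128 MB         131072        4096 kB             4 MB
	 *   256 MB         262144        8192 kB             8 MB
	 *     8 GB        8388608      262144 kB             8 MB (capped)
	 */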
3189 /* 3194 /*
3190 * Reinitialise user and admin reserves if memory is added or removed. 3195 * Reinitialise user and admin reserves if memory is added or removed.
3191 * 3196 *
3192 * The default user reserve max is 128MB, and the default max for the 3197 * The default user reserve max is 128MB, and the default max for the
3193 * admin reserve is 8MB. These are usually, but not always, enough to 3198 * admin reserve is 8MB. These are usually, but not always, enough to
3194 * enable recovery from a memory hogging process using login/sshd, a shell, 3199 * enable recovery from a memory hogging process using login/sshd, a shell,
3195 * and tools like top. It may make sense to increase or even disable the 3200 * and tools like top. It may make sense to increase or even disable the
3196 * reserve depending on the existence of swap or variations in the recovery 3201 * reserve depending on the existence of swap or variations in the recovery
3197 * tools. So, the admin may have changed them. 3202 * tools. So, the admin may have changed them.
3198 * 3203 *
3199 * If memory is added and the reserves have been eliminated or increased above 3204 * If memory is added and the reserves have been eliminated or increased above
3200 * the default max, then we'll trust the admin. 3205 * the default max, then we'll trust the admin.
3201 * 3206 *
3202 * If memory is removed and there isn't enough free memory, then we 3207 * If memory is removed and there isn't enough free memory, then we
3203 * need to reset the reserves. 3208 * need to reset the reserves.
3204 * 3209 *
3205 * Otherwise keep the reserve set by the admin. 3210 * Otherwise keep the reserve set by the admin.
3206 */ 3211 */
3207 static int reserve_mem_notifier(struct notifier_block *nb, 3212 static int reserve_mem_notifier(struct notifier_block *nb,
3208 unsigned long action, void *data) 3213 unsigned long action, void *data)
3209 { 3214 {
3210 unsigned long tmp, free_kbytes; 3215 unsigned long tmp, free_kbytes;
3211 3216
3212 switch (action) { 3217 switch (action) {
3213 case MEM_ONLINE: 3218 case MEM_ONLINE:
3214 /* Default max is 128MB. Leave alone if modified by operator. */ 3219 /* Default max is 128MB. Leave alone if modified by operator. */
3215 tmp = sysctl_user_reserve_kbytes; 3220 tmp = sysctl_user_reserve_kbytes;
3216 if (0 < tmp && tmp < (1UL << 17)) 3221 if (0 < tmp && tmp < (1UL << 17))
3217 init_user_reserve(); 3222 init_user_reserve();
3218 3223
3219 /* Default max is 8MB. Leave alone if modified by operator. */ 3224 /* Default max is 8MB. Leave alone if modified by operator. */
3220 tmp = sysctl_admin_reserve_kbytes; 3225 tmp = sysctl_admin_reserve_kbytes;
3221 if (0 < tmp && tmp < (1UL << 13)) 3226 if (0 < tmp && tmp < (1UL << 13))
3222 init_admin_reserve(); 3227 init_admin_reserve();
3223 3228
3224 break; 3229 break;
3225 case MEM_OFFLINE: 3230 case MEM_OFFLINE:
3226 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); 3231 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3227 3232
3228 if (sysctl_user_reserve_kbytes > free_kbytes) { 3233 if (sysctl_user_reserve_kbytes > free_kbytes) {
3229 init_user_reserve(); 3234 init_user_reserve();
3230 pr_info("vm.user_reserve_kbytes reset to %lu\n", 3235 pr_info("vm.user_reserve_kbytes reset to %lu\n",
3231 sysctl_user_reserve_kbytes); 3236 sysctl_user_reserve_kbytes);
3232 } 3237 }
3233 3238
3234 if (sysctl_admin_reserve_kbytes > free_kbytes) { 3239 if (sysctl_admin_reserve_kbytes > free_kbytes) {
3235 init_admin_reserve(); 3240 init_admin_reserve();
3236 pr_info("vm.admin_reserve_kbytes reset to %lu\n", 3241 pr_info("vm.admin_reserve_kbytes reset to %lu\n",
3237 sysctl_admin_reserve_kbytes); 3242 sysctl_admin_reserve_kbytes);
3238 } 3243 }
3239 break; 3244 break;
3240 default: 3245 default:
3241 break; 3246 break;
3242 } 3247 }
3243 return NOTIFY_OK; 3248 return NOTIFY_OK;
3244 } 3249 }
3245 3250
3246 static struct notifier_block reserve_mem_nb = { 3251 static struct notifier_block reserve_mem_nb = {
3247 .notifier_call = reserve_mem_notifier, 3252 .notifier_call = reserve_mem_notifier,
3248 }; 3253 };
3249 3254
3250 static int __meminit init_reserve_notifier(void) 3255 static int __meminit init_reserve_notifier(void)
3251 { 3256 {
3252 if (register_hotmemory_notifier(&reserve_mem_nb)) 3257 if (register_hotmemory_notifier(&reserve_mem_nb))
3253 printk(KERN_WARNING "Failed registering memory add/remove notifier for admin reserve\n"); 3258 printk(KERN_WARNING "Failed registering memory add/remove notifier for admin reserve\n");
3254 3259
3255 return 0; 3260 return 0;
3256 } 3261 }
3257 module_init(init_reserve_notifier) 3262 module_init(init_reserve_notifier)
3258 3263
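To make the hotplug policy concrete, here is how the notifier treats a few example values of vm.user_reserve_kbytes (example numbers only; the default cap is 1UL << 17 kB = 128 MB, and the same pattern applies to the admin reserve with its 8 MB cap):

	/*
	 * value before hotplug     MEM_ONLINE                  MEM_OFFLINE
	 * 0      (disabled)        left alone                  left alone (0 is never > free_kbytes)
	 * 65536  (64 MB, < cap)    recomputed                  recomputed only if 65536 > free_kbytes
	 * 262144 (256 MB, > cap)   left alone (admin raised)   recomputed if 262144 > free_kbytes
	 */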