Commit 877c27dbaacad7b8c8c37c62ab9f5726f48c15ad

Authored by Eric W. Biederman
Committed by Greg Kroah-Hartman
1 parent 9e2a8e62a4

mnt: Fix a memory stomp in umount

commit c297abfdf15b4480704d6b566ca5ca9438b12456 upstream.

While reviewing the code of umount_tree I realized that when we append
to a preexisting unmounted list we do not change pprev of the former
first item in the list.

This means that later, in namespace_unlock, hlist_del_init(&mnt->mnt_hash)
on the former first item of the list will stomp unmounted.first, leaving
it set to some random mount point which we are likely to free soon.

This isn't likely to hit, but if it does I don't know how anyone could
track it down.
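
To see why, recall how hlist deletion works: each node's pprev points at
the location that points to it, and deletion writes the node's next back
through that pointer. A minimal sketch of the failure mode (simplified
types for illustration, not the kernel's actual list.h):

    struct hlist_node { struct hlist_node *next, **pprev; };
    struct hlist_head { struct hlist_node *first; };

    static void hlist_del(struct hlist_node *n)
    {
            /* Whatever still points at @n is overwritten with n->next --
             * or stomped, if n->pprev has gone stale. */
            *n->pprev = n->next;
            if (n->next)
                    n->next->pprev = n->pprev;
    }

    /* The buggy open-coded splice: prepend @from (whose final node is
     * @last) onto @to without fixing the old first element's pprev. */
    static void buggy_splice(struct hlist_head *to, struct hlist_head *from,
                             struct hlist_node *last)
    {
            last->next = to->first;         /* old first now follows @last, */
            to->first = from->first;        /* but old_first->pprev still   */
            to->first->pprev = &to->first;  /* points at &to->first, so a   */
    }                                       /* later delete stomps to->first. */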

[ This happened because we don't have all the same operations for
  hlists as we do for normal doubly-linked lists. In particular,
  list_splice() is easy on our standard doubly-linked lists, while
  hlist_splice() doesn't exist and needs both the start and end entries
  of the hlist.  And commit 38129a13e6e7 incorrectly open-coded that
  missing hlist_splice().

  We should think about making these kinds of "mindless" conversions
  easier to get right by adding the missing hlist helpers. - Linus ]
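
A helper in the spirit Linus describes might look like the sketch below
(hypothetical; no such helper existed at the time of this commit). Because
a plain hlist carries no tail pointer, the caller must supply both ends of
the source list, and both splice points need their pprev fixed up:

    /* Hypothetical hlist_splice(): move the chain running from @first
     * to @last onto the front of @to. */
    static inline void hlist_splice(struct hlist_node *first,
                                    struct hlist_node *last,
                                    struct hlist_head *to)
    {
            last->next = to->first;
            if (to->first)                          /* the step the hlist   */
                    to->first->pprev = &last->next; /* conversion missed    */
            to->first = first;
            first->pprev = &to->first;
    }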

Fixes: 38129a13e6e7 ("switch mnt_hash to hlist")
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Showing 1 changed file with 2 additions and 0 deletions
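
(The context shown below is truncated before the changed hunk in
umount_tree(). Going by the commit message and the two-line addition
count, the fix patches the former first element's pprev when appending to
a non-empty unmounted list, roughly:

    	last->mnt_hash.next = unmounted.first;
    +	if (unmounted.first)
    +		unmounted.first->pprev = &last->mnt_hash.next;
    	unmounted.first = tmp_list.first;
    	unmounted.first->pprev = &unmounted.first;

reconstructed here for readability; the hunk itself is not visible in this
excerpt.)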

/*
 * linux/fs/namespace.c
 *
 * (C) Copyright Al Viro 2000, 2001
 * Released under GPL v2.
 *
 * Based on code from fs/super.c, copyright Linus Torvalds and others.
 * Heavily rewritten.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/capability.h>
#include <linux/mnt_namespace.h>
#include <linux/user_namespace.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/idr.h>
#include <linux/init.h>		/* init_rootfs */
#include <linux/fs_struct.h>	/* get_fs_root et.al. */
#include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */
#include <linux/uaccess.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/bootmem.h>
#include <linux/task_work.h>
#include "pnode.h"
#include "internal.h"

static unsigned int m_hash_mask __read_mostly;
static unsigned int m_hash_shift __read_mostly;
static unsigned int mp_hash_mask __read_mostly;
static unsigned int mp_hash_shift __read_mostly;

static __initdata unsigned long mhash_entries;
static int __init set_mhash_entries(char *str)
{
	if (!str)
		return 0;
	mhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("mhash_entries=", set_mhash_entries);

static __initdata unsigned long mphash_entries;
static int __init set_mphash_entries(char *str)
{
	if (!str)
		return 0;
	mphash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("mphash_entries=", set_mphash_entries);

static u64 event;
static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);
static DEFINE_SPINLOCK(mnt_id_lock);
static int mnt_id_start = 0;
static int mnt_group_start = 1;

static struct hlist_head *mount_hashtable __read_mostly;
static struct hlist_head *mountpoint_hashtable __read_mostly;
static struct kmem_cache *mnt_cache __read_mostly;
static DECLARE_RWSEM(namespace_sem);

/* /sys/fs */
struct kobject *fs_kobj;
EXPORT_SYMBOL_GPL(fs_kobj);

/*
 * vfsmount lock may be taken for read to prevent changes to the
 * vfsmount hash, ie. during mountpoint lookups or walking back
 * up the tree.
 *
 * It should be taken for write in all cases where the vfsmount
 * tree or hash is modified or when a vfsmount structure is modified.
 */
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);

static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
{
	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
	tmp = tmp + (tmp >> m_hash_shift);
	return &mount_hashtable[tmp & m_hash_mask];
}

static inline struct hlist_head *mp_hash(struct dentry *dentry)
{
	unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
	tmp = tmp + (tmp >> mp_hash_shift);
	return &mountpoint_hashtable[tmp & mp_hash_mask];
}

/*
 * allocation is serialized by namespace_sem, but we need the spinlock to
 * serialize with freeing.
 */
static int mnt_alloc_id(struct mount *mnt)
{
	int res;

retry:
	ida_pre_get(&mnt_id_ida, GFP_KERNEL);
	spin_lock(&mnt_id_lock);
	res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
	if (!res)
		mnt_id_start = mnt->mnt_id + 1;
	spin_unlock(&mnt_id_lock);
	if (res == -EAGAIN)
		goto retry;

	return res;
}

static void mnt_free_id(struct mount *mnt)
{
	int id = mnt->mnt_id;
	spin_lock(&mnt_id_lock);
	ida_remove(&mnt_id_ida, id);
	if (mnt_id_start > id)
		mnt_id_start = id;
	spin_unlock(&mnt_id_lock);
}

/*
 * Allocate a new peer group ID
 *
 * mnt_group_ida is protected by namespace_sem
 */
static int mnt_alloc_group_id(struct mount *mnt)
{
	int res;

	if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
		return -ENOMEM;

	res = ida_get_new_above(&mnt_group_ida,
				mnt_group_start,
				&mnt->mnt_group_id);
	if (!res)
		mnt_group_start = mnt->mnt_group_id + 1;

	return res;
}

/*
 * Release a peer group ID
 */
void mnt_release_group_id(struct mount *mnt)
{
	int id = mnt->mnt_group_id;
	ida_remove(&mnt_group_ida, id);
	if (mnt_group_start > id)
		mnt_group_start = id;
	mnt->mnt_group_id = 0;
}

/*
 * vfsmount lock must be held for read
 */
static inline void mnt_add_count(struct mount *mnt, int n)
{
#ifdef CONFIG_SMP
	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
#else
	preempt_disable();
	mnt->mnt_count += n;
	preempt_enable();
#endif
}

/*
 * vfsmount lock must be held for write
 */
unsigned int mnt_get_count(struct mount *mnt)
{
#ifdef CONFIG_SMP
	unsigned int count = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
	}

	return count;
#else
	return mnt->mnt_count;
#endif
}

static struct mount *alloc_vfsmnt(const char *name)
{
	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
	if (mnt) {
		int err;

		err = mnt_alloc_id(mnt);
		if (err)
			goto out_free_cache;

		if (name) {
			mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
			if (!mnt->mnt_devname)
				goto out_free_id;
		}

#ifdef CONFIG_SMP
		mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
		if (!mnt->mnt_pcp)
			goto out_free_devname;

		this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
#else
		mnt->mnt_count = 1;
		mnt->mnt_writers = 0;
#endif

		INIT_HLIST_NODE(&mnt->mnt_hash);
		INIT_LIST_HEAD(&mnt->mnt_child);
		INIT_LIST_HEAD(&mnt->mnt_mounts);
		INIT_LIST_HEAD(&mnt->mnt_list);
		INIT_LIST_HEAD(&mnt->mnt_expire);
		INIT_LIST_HEAD(&mnt->mnt_share);
		INIT_LIST_HEAD(&mnt->mnt_slave_list);
		INIT_LIST_HEAD(&mnt->mnt_slave);
		INIT_HLIST_NODE(&mnt->mnt_mp_list);
#ifdef CONFIG_FSNOTIFY
		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
#endif
	}
	return mnt;

#ifdef CONFIG_SMP
out_free_devname:
	kfree(mnt->mnt_devname);
#endif
out_free_id:
	mnt_free_id(mnt);
out_free_cache:
	kmem_cache_free(mnt_cache, mnt);
	return NULL;
}

/*
 * Most r/o checks on a fs are for operations that take
 * discrete amounts of time, like a write() or unlink().
 * We must keep track of when those operations start
 * (for permission checks) and when they end, so that
 * we can determine when writes are able to occur to
 * a filesystem.
 */
/*
 * __mnt_is_readonly: check whether a mount is read-only
 * @mnt: the mount to check for its write status
 *
 * This shouldn't be used directly ouside of the VFS.
 * It does not guarantee that the filesystem will stay
 * r/w, just that it is right *now*. This can not and
 * should not be used in place of IS_RDONLY(inode).
 * mnt_want/drop_write() will _keep_ the filesystem
 * r/w.
 */
int __mnt_is_readonly(struct vfsmount *mnt)
{
	if (mnt->mnt_flags & MNT_READONLY)
		return 1;
	if (mnt->mnt_sb->s_flags & MS_RDONLY)
		return 1;
	return 0;
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);

static inline void mnt_inc_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers++;
#endif
}

static inline void mnt_dec_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers--;
#endif
}

static unsigned int mnt_get_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	unsigned int count = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
	}

	return count;
#else
	return mnt->mnt_writers;
#endif
}

static int mnt_is_readonly(struct vfsmount *mnt)
{
	if (mnt->mnt_sb->s_readonly_remount)
		return 1;
	/* Order wrt setting s_flags/s_readonly_remount in do_remount() */
	smp_rmb();
	return __mnt_is_readonly(mnt);
}

/*
 * Most r/o & frozen checks on a fs are for operations that take discrete
 * amounts of time, like a write() or unlink(). We must keep track of when
 * those operations start (for permission checks) and when they end, so that we
 * can determine when writes are able to occur to a filesystem.
 */
/**
 * __mnt_want_write - get write access to a mount without freeze protection
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mnt it read-write) before
 * returning success. This operation does not protect against filesystem being
 * frozen. When the write operation is finished, __mnt_drop_write() must be
 * called. This is effectively a refcount.
 */
int __mnt_want_write(struct vfsmount *m)
{
	struct mount *mnt = real_mount(m);
	int ret = 0;

	preempt_disable();
	mnt_inc_writers(mnt);
	/*
	 * The store to mnt_inc_writers must be visible before we pass
	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
	 * incremented count after it has set MNT_WRITE_HOLD.
	 */
	smp_mb();
	while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
		cpu_relax();
	/*
	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
	 * be set to match its requirements. So we must not load that until
	 * MNT_WRITE_HOLD is cleared.
	 */
	smp_rmb();
	if (mnt_is_readonly(m)) {
		mnt_dec_writers(mnt);
		ret = -EROFS;
	}
	preempt_enable();

	return ret;
}

/**
 * mnt_want_write - get write access to a mount
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write, filesystem
 * is not frozen) before returning success. When the write operation is
 * finished, mnt_drop_write() must be called. This is effectively a refcount.
 */
int mnt_want_write(struct vfsmount *m)
{
	int ret;

	sb_start_write(m->mnt_sb);
	ret = __mnt_want_write(m);
	if (ret)
		sb_end_write(m->mnt_sb);
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);

/**
 * mnt_clone_write - get write access to a mount
 * @mnt: the mount on which to take a write
 *
 * This is effectively like mnt_want_write, except
 * it must only be used to take an extra write reference
 * on a mountpoint that we already know has a write reference
 * on it. This allows some optimisation.
 *
 * After finished, mnt_drop_write must be called as usual to
 * drop the reference.
 */
int mnt_clone_write(struct vfsmount *mnt)
{
	/* superblock may be r/o */
	if (__mnt_is_readonly(mnt))
		return -EROFS;
	preempt_disable();
	mnt_inc_writers(real_mount(mnt));
	preempt_enable();
	return 0;
}
EXPORT_SYMBOL_GPL(mnt_clone_write);

/**
 * __mnt_want_write_file - get write access to a file's mount
 * @file: the file who's mount on which to take a write
 *
 * This is like __mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
int __mnt_want_write_file(struct file *file)
{
	if (!(file->f_mode & FMODE_WRITER))
		return __mnt_want_write(file->f_path.mnt);
	else
		return mnt_clone_write(file->f_path.mnt);
}

/**
 * mnt_want_write_file - get write access to a file's mount
 * @file: the file who's mount on which to take a write
 *
 * This is like mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
int mnt_want_write_file(struct file *file)
{
	int ret;

	sb_start_write(file->f_path.mnt->mnt_sb);
	ret = __mnt_want_write_file(file);
	if (ret)
		sb_end_write(file->f_path.mnt->mnt_sb);
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write_file);

/**
 * __mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done
 * performing writes to it. Must be matched with
 * __mnt_want_write() call above.
 */
void __mnt_drop_write(struct vfsmount *mnt)
{
	preempt_disable();
	mnt_dec_writers(real_mount(mnt));
	preempt_enable();
}

/**
 * mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done performing writes to it and
 * also allows filesystem to be frozen again. Must be matched with
 * mnt_want_write() call above.
 */
void mnt_drop_write(struct vfsmount *mnt)
{
	__mnt_drop_write(mnt);
	sb_end_write(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);

void __mnt_drop_write_file(struct file *file)
{
	__mnt_drop_write(file->f_path.mnt);
}

void mnt_drop_write_file(struct file *file)
{
	mnt_drop_write(file->f_path.mnt);
}
EXPORT_SYMBOL(mnt_drop_write_file);

static int mnt_make_readonly(struct mount *mnt)
{
	int ret = 0;

	lock_mount_hash();
	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
	/*
	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
	 * should be visible before we do.
	 */
	smp_mb();

	/*
	 * With writers on hold, if this value is zero, then there are
	 * definitely no active writers (although held writers may subsequently
	 * increment the count, they'll have to wait, and decrement it after
	 * seeing MNT_READONLY).
	 *
	 * It is OK to have counter incremented on one CPU and decremented on
	 * another: the sum will add up correctly. The danger would be when we
	 * sum up each counter, if we read a counter before it is incremented,
	 * but then read another CPU's count which it has been subsequently
	 * decremented from -- we would see more decrements than we should.
	 * MNT_WRITE_HOLD protects against this scenario, because
	 * mnt_want_write first increments count, then smp_mb, then spins on
	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
	 * we're counting up here.
	 */
	if (mnt_get_writers(mnt) > 0)
		ret = -EBUSY;
	else
		mnt->mnt.mnt_flags |= MNT_READONLY;
	/*
	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
	 * that become unheld will see MNT_READONLY.
	 */
	smp_wmb();
	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
	unlock_mount_hash();
	return ret;
}

static void __mnt_unmake_readonly(struct mount *mnt)
{
	lock_mount_hash();
	mnt->mnt.mnt_flags &= ~MNT_READONLY;
	unlock_mount_hash();
}

int sb_prepare_remount_readonly(struct super_block *sb)
{
	struct mount *mnt;
	int err = 0;

	/* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */
	if (atomic_long_read(&sb->s_remove_count))
		return -EBUSY;

	lock_mount_hash();
	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
		if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
			mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
			smp_mb();
			if (mnt_get_writers(mnt) > 0) {
				err = -EBUSY;
				break;
			}
		}
	}
	if (!err && atomic_long_read(&sb->s_remove_count))
		err = -EBUSY;

	if (!err) {
		sb->s_readonly_remount = 1;
		smp_wmb();
	}
	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
			mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
	}
	unlock_mount_hash();

	return err;
}

static void free_vfsmnt(struct mount *mnt)
{
	kfree(mnt->mnt_devname);
#ifdef CONFIG_SMP
	free_percpu(mnt->mnt_pcp);
#endif
	kmem_cache_free(mnt_cache, mnt);
}

static void delayed_free_vfsmnt(struct rcu_head *head)
{
	free_vfsmnt(container_of(head, struct mount, mnt_rcu));
}

/* call under rcu_read_lock */
bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
	struct mount *mnt;
	if (read_seqretry(&mount_lock, seq))
		return false;
	if (bastard == NULL)
		return true;
	mnt = real_mount(bastard);
	mnt_add_count(mnt, 1);
	if (likely(!read_seqretry(&mount_lock, seq)))
		return true;
	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
		mnt_add_count(mnt, -1);
		return false;
	}
	rcu_read_unlock();
	mntput(bastard);
	rcu_read_lock();
	return false;
}

/*
 * find the first mount at @dentry on vfsmount @mnt.
 * call under rcu_read_lock()
 */
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
	struct hlist_head *head = m_hash(mnt, dentry);
	struct mount *p;

	hlist_for_each_entry_rcu(p, head, mnt_hash)
		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
			return p;
	return NULL;
}

/*
 * find the last mount at @dentry on vfsmount @mnt.
 * mount_lock must be held.
 */
struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
{
	struct mount *p, *res;
	res = p = __lookup_mnt(mnt, dentry);
	if (!p)
		goto out;
	hlist_for_each_entry_continue(p, mnt_hash) {
		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
			break;
		res = p;
	}
out:
	return res;
}

/*
 * lookup_mnt - Return the first child mount mounted at path
 *
 * "First" means first mounted chronologically. If you create the
 * following mounts:
 *
 * mount /dev/sda1 /mnt
 * mount /dev/sda2 /mnt
 * mount /dev/sda3 /mnt
 *
 * Then lookup_mnt() on the base /mnt dentry in the root mount will
 * return successively the root dentry and vfsmount of /dev/sda1, then
 * /dev/sda2, then /dev/sda3, then NULL.
 *
 * lookup_mnt takes a reference to the found vfsmount.
 */
struct vfsmount *lookup_mnt(struct path *path)
{
	struct mount *child_mnt;
	struct vfsmount *m;
	unsigned seq;

	rcu_read_lock();
	do {
		seq = read_seqbegin(&mount_lock);
		child_mnt = __lookup_mnt(path->mnt, path->dentry);
		m = child_mnt ? &child_mnt->mnt : NULL;
	} while (!legitimize_mnt(m, seq));
	rcu_read_unlock();
	return m;
}

/*
 * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
 * current mount namespace.
 *
 * The common case is dentries are not mountpoints at all and that
 * test is handled inline. For the slow case when we are actually
 * dealing with a mountpoint of some kind, walk through all of the
 * mounts in the current mount namespace and test to see if the dentry
 * is a mountpoint.
 *
 * The mount_hashtable is not usable in the context because we
 * need to identify all mounts that may be in the current mount
 * namespace not just a mount that happens to have some specified
 * parent mount.
 */
bool __is_local_mountpoint(struct dentry *dentry)
{
	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
	struct mount *mnt;
	bool is_covered = false;

	if (!d_mountpoint(dentry))
		goto out;

	down_read(&namespace_sem);
	list_for_each_entry(mnt, &ns->list, mnt_list) {
		is_covered = (mnt->mnt_mountpoint == dentry);
		if (is_covered)
			break;
	}
	up_read(&namespace_sem);
out:
	return is_covered;
}

static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
{
	struct hlist_head *chain = mp_hash(dentry);
	struct mountpoint *mp;

	hlist_for_each_entry(mp, chain, m_hash) {
		if (mp->m_dentry == dentry) {
			/* might be worth a WARN_ON() */
			if (d_unlinked(dentry))
				return ERR_PTR(-ENOENT);
			mp->m_count++;
			return mp;
		}
	}
	return NULL;
}

static struct mountpoint *new_mountpoint(struct dentry *dentry)
{
	struct hlist_head *chain = mp_hash(dentry);
	struct mountpoint *mp;
	int ret;

	mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
	if (!mp)
		return ERR_PTR(-ENOMEM);

	ret = d_set_mounted(dentry);
	if (ret) {
		kfree(mp);
		return ERR_PTR(ret);
	}

	mp->m_dentry = dentry;
	mp->m_count = 1;
	hlist_add_head(&mp->m_hash, chain);
	INIT_HLIST_HEAD(&mp->m_list);
	return mp;
}

static void put_mountpoint(struct mountpoint *mp)
{
	if (!--mp->m_count) {
		struct dentry *dentry = mp->m_dentry;
		BUG_ON(!hlist_empty(&mp->m_list));
		spin_lock(&dentry->d_lock);
		dentry->d_flags &= ~DCACHE_MOUNTED;
		spin_unlock(&dentry->d_lock);
		hlist_del(&mp->m_hash);
		kfree(mp);
	}
}

static inline int check_mnt(struct mount *mnt)
{
	return mnt->mnt_ns == current->nsproxy->mnt_ns;
}

/*
 * vfsmount lock must be held for write
 */
static void touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns) {
		ns->event = ++event;
		wake_up_interruptible(&ns->poll);
	}
}

/*
 * vfsmount lock must be held for write
 */
static void __touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns && ns->event != event) {
		ns->event = event;
		wake_up_interruptible(&ns->poll);
	}
}

/*
 * vfsmount lock must be held for write
 */
static void detach_mnt(struct mount *mnt, struct path *old_path)
{
	old_path->dentry = mnt->mnt_mountpoint;
	old_path->mnt = &mnt->mnt_parent->mnt;
	mnt->mnt_parent = mnt;
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	list_del_init(&mnt->mnt_child);
	hlist_del_init_rcu(&mnt->mnt_hash);
	hlist_del_init(&mnt->mnt_mp_list);
	put_mountpoint(mnt->mnt_mp);
	mnt->mnt_mp = NULL;
}

/*
 * vfsmount lock must be held for write
 */
void mnt_set_mountpoint(struct mount *mnt,
			struct mountpoint *mp,
			struct mount *child_mnt)
{
	mp->m_count++;
	mnt_add_count(mnt, 1);	/* essentially, that's mntget */
	child_mnt->mnt_mountpoint = dget(mp->m_dentry);
	child_mnt->mnt_parent = mnt;
	child_mnt->mnt_mp = mp;
	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
}

/*
 * vfsmount lock must be held for write
 */
static void attach_mnt(struct mount *mnt,
			struct mount *parent,
			struct mountpoint *mp)
{
	mnt_set_mountpoint(parent, mp, mnt);
	hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry));
	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
}

static void attach_shadowed(struct mount *mnt,
			struct mount *parent,
			struct mount *shadows)
{
	if (shadows) {
		hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
		list_add(&mnt->mnt_child, &shadows->mnt_child);
	} else {
		hlist_add_head_rcu(&mnt->mnt_hash,
				m_hash(&parent->mnt, mnt->mnt_mountpoint));
		list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
	}
}

/*
 * vfsmount lock must be held for write
 */
static void commit_tree(struct mount *mnt, struct mount *shadows)
{
	struct mount *parent = mnt->mnt_parent;
	struct mount *m;
	LIST_HEAD(head);
	struct mnt_namespace *n = parent->mnt_ns;

	BUG_ON(parent == mnt);

	list_add_tail(&head, &mnt->mnt_list);
	list_for_each_entry(m, &head, mnt_list)
		m->mnt_ns = n;

	list_splice(&head, n->list.prev);

	attach_shadowed(mnt, parent, shadows);
	touch_mnt_namespace(n);
}

static struct mount *next_mnt(struct mount *p, struct mount *root)
{
	struct list_head *next = p->mnt_mounts.next;
	if (next == &p->mnt_mounts) {
		while (1) {
			if (p == root)
				return NULL;
			next = p->mnt_child.next;
			if (next != &p->mnt_parent->mnt_mounts)
				break;
			p = p->mnt_parent;
		}
	}
	return list_entry(next, struct mount, mnt_child);
}

static struct mount *skip_mnt_tree(struct mount *p)
{
	struct list_head *prev = p->mnt_mounts.prev;
	while (prev != &p->mnt_mounts) {
		p = list_entry(prev, struct mount, mnt_child);
		prev = p->mnt_mounts.prev;
	}
	return p;
}

struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
	struct mount *mnt;
	struct dentry *root;

	if (!type)
		return ERR_PTR(-ENODEV);

	mnt = alloc_vfsmnt(name);
	if (!mnt)
		return ERR_PTR(-ENOMEM);

	if (flags & MS_KERNMOUNT)
		mnt->mnt.mnt_flags = MNT_INTERNAL;

	root = mount_fs(type, flags, name, data);
	if (IS_ERR(root)) {
		mnt_free_id(mnt);
		free_vfsmnt(mnt);
		return ERR_CAST(root);
	}

	mnt->mnt.mnt_root = root;
	mnt->mnt.mnt_sb = root->d_sb;
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	mnt->mnt_parent = mnt;
	lock_mount_hash();
	list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
	unlock_mount_hash();
	return &mnt->mnt;
}
EXPORT_SYMBOL_GPL(vfs_kern_mount);

static struct mount *clone_mnt(struct mount *old, struct dentry *root,
					int flag)
{
	struct super_block *sb = old->mnt.mnt_sb;
	struct mount *mnt;
	int err;

	mnt = alloc_vfsmnt(old->mnt_devname);
	if (!mnt)
		return ERR_PTR(-ENOMEM);

	if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
		mnt->mnt_group_id = 0; /* not a peer of original */
	else
		mnt->mnt_group_id = old->mnt_group_id;

	if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
		err = mnt_alloc_group_id(mnt);
		if (err)
			goto out_free;
	}

	mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED);
	/* Don't allow unprivileged users to change mount flags */
	if (flag & CL_UNPRIVILEGED) {
		mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;

		if (mnt->mnt.mnt_flags & MNT_READONLY)
			mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;

		if (mnt->mnt.mnt_flags & MNT_NODEV)
			mnt->mnt.mnt_flags |= MNT_LOCK_NODEV;

		if (mnt->mnt.mnt_flags & MNT_NOSUID)
			mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID;

		if (mnt->mnt.mnt_flags & MNT_NOEXEC)
			mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC;
	}

	/* Don't allow unprivileged users to reveal what is under a mount */
	if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire))
		mnt->mnt.mnt_flags |= MNT_LOCKED;

	atomic_inc(&sb->s_active);
	mnt->mnt.mnt_sb = sb;
	mnt->mnt.mnt_root = dget(root);
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	mnt->mnt_parent = mnt;
	lock_mount_hash();
	list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
	unlock_mount_hash();

	if ((flag & CL_SLAVE) ||
	    ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
		list_add(&mnt->mnt_slave, &old->mnt_slave_list);
		mnt->mnt_master = old;
		CLEAR_MNT_SHARED(mnt);
	} else if (!(flag & CL_PRIVATE)) {
		if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
			list_add(&mnt->mnt_share, &old->mnt_share);
		if (IS_MNT_SLAVE(old))
			list_add(&mnt->mnt_slave, &old->mnt_slave);
		mnt->mnt_master = old->mnt_master;
	}
	if (flag & CL_MAKE_SHARED)
		set_mnt_shared(mnt);

	/* stick the duplicate mount on the same expiry list
	 * as the original if that was on one */
	if (flag & CL_EXPIRE) {
		if (!list_empty(&old->mnt_expire))
			list_add(&mnt->mnt_expire, &old->mnt_expire);
	}

	return mnt;

out_free:
	mnt_free_id(mnt);
	free_vfsmnt(mnt);
	return ERR_PTR(err);
}

static void cleanup_mnt(struct mount *mnt)
{
	/*
	 * This probably indicates that somebody messed
	 * up a mnt_want/drop_write() pair. If this
	 * happens, the filesystem was probably unable
	 * to make r/w->r/o transitions.
	 */
	/*
	 * The locking used to deal with mnt_count decrement provides barriers,
	 * so mnt_get_writers() below is safe.
	 */
	WARN_ON(mnt_get_writers(mnt));
	if (unlikely(mnt->mnt_pins.first))
		mnt_pin_kill(mnt);
	fsnotify_vfsmount_delete(&mnt->mnt);
	dput(mnt->mnt.mnt_root);
	deactivate_super(mnt->mnt.mnt_sb);
	mnt_free_id(mnt);
	call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
}

static void __cleanup_mnt(struct rcu_head *head)
{
	cleanup_mnt(container_of(head, struct mount, mnt_rcu));
}

static LLIST_HEAD(delayed_mntput_list);
static void delayed_mntput(struct work_struct *unused)
{
	struct llist_node *node = llist_del_all(&delayed_mntput_list);
	struct llist_node *next;

	for (; node; node = next) {
		next = llist_next(node);
		cleanup_mnt(llist_entry(node, struct mount, mnt_llist));
	}
}
static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);

static void mntput_no_expire(struct mount *mnt)
{
	rcu_read_lock();
	mnt_add_count(mnt, -1);
	if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
		rcu_read_unlock();
		return;
	}
	lock_mount_hash();
	if (mnt_get_count(mnt)) {
		rcu_read_unlock();
		unlock_mount_hash();
		return;
	}
	if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
		rcu_read_unlock();
		unlock_mount_hash();
		return;
	}
	mnt->mnt.mnt_flags |= MNT_DOOMED;
	rcu_read_unlock();

	list_del(&mnt->mnt_instance);
1071 unlock_mount_hash(); 1071 unlock_mount_hash();
1072 1072
1073 if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) { 1073 if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
1074 struct task_struct *task = current; 1074 struct task_struct *task = current;
1075 if (likely(!(task->flags & PF_KTHREAD))) { 1075 if (likely(!(task->flags & PF_KTHREAD))) {
1076 init_task_work(&mnt->mnt_rcu, __cleanup_mnt); 1076 init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
1077 if (!task_work_add(task, &mnt->mnt_rcu, true)) 1077 if (!task_work_add(task, &mnt->mnt_rcu, true))
1078 return; 1078 return;
1079 } 1079 }
1080 if (llist_add(&mnt->mnt_llist, &delayed_mntput_list)) 1080 if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
1081 schedule_delayed_work(&delayed_mntput_work, 1); 1081 schedule_delayed_work(&delayed_mntput_work, 1);
1082 return; 1082 return;
1083 } 1083 }
1084 cleanup_mnt(mnt); 1084 cleanup_mnt(mnt);
1085 } 1085 }
1086 1086
1087 void mntput(struct vfsmount *mnt) 1087 void mntput(struct vfsmount *mnt)
1088 { 1088 {
1089 if (mnt) { 1089 if (mnt) {
1090 struct mount *m = real_mount(mnt); 1090 struct mount *m = real_mount(mnt);
1091 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ 1091 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
1092 if (unlikely(m->mnt_expiry_mark)) 1092 if (unlikely(m->mnt_expiry_mark))
1093 m->mnt_expiry_mark = 0; 1093 m->mnt_expiry_mark = 0;
1094 mntput_no_expire(m); 1094 mntput_no_expire(m);
1095 } 1095 }
1096 } 1096 }
1097 EXPORT_SYMBOL(mntput); 1097 EXPORT_SYMBOL(mntput);
1098 1098
1099 struct vfsmount *mntget(struct vfsmount *mnt) 1099 struct vfsmount *mntget(struct vfsmount *mnt)
1100 { 1100 {
1101 if (mnt) 1101 if (mnt)
1102 mnt_add_count(real_mount(mnt), 1); 1102 mnt_add_count(real_mount(mnt), 1);
1103 return mnt; 1103 return mnt;
1104 } 1104 }
1105 EXPORT_SYMBOL(mntget); 1105 EXPORT_SYMBOL(mntget);
1106 1106
1107 struct vfsmount *mnt_clone_internal(struct path *path) 1107 struct vfsmount *mnt_clone_internal(struct path *path)
1108 { 1108 {
1109 struct mount *p; 1109 struct mount *p;
1110 p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE); 1110 p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
1111 if (IS_ERR(p)) 1111 if (IS_ERR(p))
1112 return ERR_CAST(p); 1112 return ERR_CAST(p);
1113 p->mnt.mnt_flags |= MNT_INTERNAL; 1113 p->mnt.mnt_flags |= MNT_INTERNAL;
1114 return &p->mnt; 1114 return &p->mnt;
1115 } 1115 }
1116 1116
1117 static inline void mangle(struct seq_file *m, const char *s) 1117 static inline void mangle(struct seq_file *m, const char *s)
1118 { 1118 {
1119 seq_escape(m, s, " \t\n\\"); 1119 seq_escape(m, s, " \t\n\\");
1120 } 1120 }
1121 1121
1122 /* 1122 /*
1123 * Simple .show_options callback for filesystems which don't want to 1123 * Simple .show_options callback for filesystems which don't want to
1124 * implement more complex mount option showing. 1124 * implement more complex mount option showing.
1125 * 1125 *
1126 * See also save_mount_options(). 1126 * See also save_mount_options().
1127 */ 1127 */
1128 int generic_show_options(struct seq_file *m, struct dentry *root) 1128 int generic_show_options(struct seq_file *m, struct dentry *root)
1129 { 1129 {
1130 const char *options; 1130 const char *options;
1131 1131
1132 rcu_read_lock(); 1132 rcu_read_lock();
1133 options = rcu_dereference(root->d_sb->s_options); 1133 options = rcu_dereference(root->d_sb->s_options);
1134 1134
1135 if (options != NULL && options[0]) { 1135 if (options != NULL && options[0]) {
1136 seq_putc(m, ','); 1136 seq_putc(m, ',');
1137 mangle(m, options); 1137 mangle(m, options);
1138 } 1138 }
1139 rcu_read_unlock(); 1139 rcu_read_unlock();
1140 1140
1141 return 0; 1141 return 0;
1142 } 1142 }
1143 EXPORT_SYMBOL(generic_show_options); 1143 EXPORT_SYMBOL(generic_show_options);
1144 1144
1145 /* 1145 /*
1146 * If filesystem uses generic_show_options(), this function should be 1146 * If filesystem uses generic_show_options(), this function should be
1147 * called from the fill_super() callback. 1147 * called from the fill_super() callback.
1148 * 1148 *
1149 * The .remount_fs callback usually needs to be handled in a special 1149 * The .remount_fs callback usually needs to be handled in a special
1150 * way, to make sure, that previous options are not overwritten if the 1150 * way, to make sure, that previous options are not overwritten if the
1151 * remount fails. 1151 * remount fails.
1152 * 1152 *
1153 * Also note, that if the filesystem's .remount_fs function doesn't 1153 * Also note, that if the filesystem's .remount_fs function doesn't
1154 * reset all options to their default value, but changes only newly 1154 * reset all options to their default value, but changes only newly
1155 * given options, then the displayed options will not reflect reality 1155 * given options, then the displayed options will not reflect reality
1156 * any more. 1156 * any more.
1157 */ 1157 */
1158 void save_mount_options(struct super_block *sb, char *options) 1158 void save_mount_options(struct super_block *sb, char *options)
1159 { 1159 {
1160 BUG_ON(sb->s_options); 1160 BUG_ON(sb->s_options);
1161 rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL)); 1161 rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL));
1162 } 1162 }
1163 EXPORT_SYMBOL(save_mount_options); 1163 EXPORT_SYMBOL(save_mount_options);
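A minimal sketch of the pattern the comments above describe, for a hypothetical filesystem (the examplefs names are placeholders, not real kernel code): the mount-time option string is stashed with save_mount_options() from fill_super(), and generic_show_options() replays it later:

#include <linux/fs.h>

static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
{
	save_mount_options(sb, data);	/* keep a copy for ->show_options */
	/* ... usual superblock setup elided ... */
	return 0;
}

static const struct super_operations examplefs_sops = {
	.show_options	= generic_show_options,
};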
1164 1164
1165 void replace_mount_options(struct super_block *sb, char *options) 1165 void replace_mount_options(struct super_block *sb, char *options)
1166 { 1166 {
1167 char *old = sb->s_options; 1167 char *old = sb->s_options;
1168 rcu_assign_pointer(sb->s_options, options); 1168 rcu_assign_pointer(sb->s_options, options);
1169 if (old) { 1169 if (old) {
1170 synchronize_rcu(); 1170 synchronize_rcu();
1171 kfree(old); 1171 kfree(old);
1172 } 1172 }
1173 } 1173 }
1174 EXPORT_SYMBOL(replace_mount_options); 1174 EXPORT_SYMBOL(replace_mount_options);
1175 1175
1176 #ifdef CONFIG_PROC_FS 1176 #ifdef CONFIG_PROC_FS
1177 /* iterator; we want it to have access to namespace_sem, thus here... */ 1177 /* iterator; we want it to have access to namespace_sem, thus here... */
1178 static void *m_start(struct seq_file *m, loff_t *pos) 1178 static void *m_start(struct seq_file *m, loff_t *pos)
1179 { 1179 {
1180 struct proc_mounts *p = proc_mounts(m); 1180 struct proc_mounts *p = proc_mounts(m);
1181 1181
1182 down_read(&namespace_sem); 1182 down_read(&namespace_sem);
1183 if (p->cached_event == p->ns->event) { 1183 if (p->cached_event == p->ns->event) {
1184 void *v = p->cached_mount; 1184 void *v = p->cached_mount;
1185 if (*pos == p->cached_index) 1185 if (*pos == p->cached_index)
1186 return v; 1186 return v;
1187 if (*pos == p->cached_index + 1) { 1187 if (*pos == p->cached_index + 1) {
1188 v = seq_list_next(v, &p->ns->list, &p->cached_index); 1188 v = seq_list_next(v, &p->ns->list, &p->cached_index);
1189 return p->cached_mount = v; 1189 return p->cached_mount = v;
1190 } 1190 }
1191 } 1191 }
1192 1192
1193 p->cached_event = p->ns->event; 1193 p->cached_event = p->ns->event;
1194 p->cached_mount = seq_list_start(&p->ns->list, *pos); 1194 p->cached_mount = seq_list_start(&p->ns->list, *pos);
1195 p->cached_index = *pos; 1195 p->cached_index = *pos;
1196 return p->cached_mount; 1196 return p->cached_mount;
1197 } 1197 }
1198 1198
1199 static void *m_next(struct seq_file *m, void *v, loff_t *pos) 1199 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
1200 { 1200 {
1201 struct proc_mounts *p = proc_mounts(m); 1201 struct proc_mounts *p = proc_mounts(m);
1202 1202
1203 p->cached_mount = seq_list_next(v, &p->ns->list, pos); 1203 p->cached_mount = seq_list_next(v, &p->ns->list, pos);
1204 p->cached_index = *pos; 1204 p->cached_index = *pos;
1205 return p->cached_mount; 1205 return p->cached_mount;
1206 } 1206 }
1207 1207
1208 static void m_stop(struct seq_file *m, void *v) 1208 static void m_stop(struct seq_file *m, void *v)
1209 { 1209 {
1210 up_read(&namespace_sem); 1210 up_read(&namespace_sem);
1211 } 1211 }
1212 1212
1213 static int m_show(struct seq_file *m, void *v) 1213 static int m_show(struct seq_file *m, void *v)
1214 { 1214 {
1215 struct proc_mounts *p = proc_mounts(m); 1215 struct proc_mounts *p = proc_mounts(m);
1216 struct mount *r = list_entry(v, struct mount, mnt_list); 1216 struct mount *r = list_entry(v, struct mount, mnt_list);
1217 return p->show(m, &r->mnt); 1217 return p->show(m, &r->mnt);
1218 } 1218 }
1219 1219
1220 const struct seq_operations mounts_op = { 1220 const struct seq_operations mounts_op = {
1221 .start = m_start, 1221 .start = m_start,
1222 .next = m_next, 1222 .next = m_next,
1223 .stop = m_stop, 1223 .stop = m_stop,
1224 .show = m_show, 1224 .show = m_show,
1225 }; 1225 };
1226 #endif /* CONFIG_PROC_FS */ 1226 #endif /* CONFIG_PROC_FS */
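The cached_mount/cached_index fast path in m_start() exists because seq_file re-enters ->start at the current position on every read(2); without the cache each entry would be found by walking the namespace list from its head, making a chunked read of a large mount table quadratic overall. A small userspace illustration (buffer size arbitrary):

#include <fcntl.h>
#include <unistd.h>

/* Each read(2) past the seq_file buffer re-enters m_start() with the
 * next position; the cached index above lets it resume in O(1). */
static void dump_mounts(void)
{
	char buf[256];
	ssize_t n;
	int fd = open("/proc/self/mounts", O_RDONLY);

	if (fd < 0)
		return;
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		write(STDOUT_FILENO, buf, n);
	close(fd);
}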
1227 1227
1228 /** 1228 /**
1229 * may_umount_tree - check if a mount tree is busy 1229 * may_umount_tree - check if a mount tree is busy
1230 * @mnt: root of mount tree 1230 * @mnt: root of mount tree
1231 * 1231 *
1232 * This is called to check if a tree of mounts has any 1232 * This is called to check if a tree of mounts has any
1233 * open files, pwds, chroots or sub mounts that are 1233 * open files, pwds, chroots or sub mounts that are
1234 * busy. 1234 * busy.
1235 */ 1235 */
1236 int may_umount_tree(struct vfsmount *m) 1236 int may_umount_tree(struct vfsmount *m)
1237 { 1237 {
1238 struct mount *mnt = real_mount(m); 1238 struct mount *mnt = real_mount(m);
1239 int actual_refs = 0; 1239 int actual_refs = 0;
1240 int minimum_refs = 0; 1240 int minimum_refs = 0;
1241 struct mount *p; 1241 struct mount *p;
1242 BUG_ON(!m); 1242 BUG_ON(!m);
1243 1243
1244 /* write lock needed for mnt_get_count */ 1244 /* write lock needed for mnt_get_count */
1245 lock_mount_hash(); 1245 lock_mount_hash();
1246 for (p = mnt; p; p = next_mnt(p, mnt)) { 1246 for (p = mnt; p; p = next_mnt(p, mnt)) {
1247 actual_refs += mnt_get_count(p); 1247 actual_refs += mnt_get_count(p);
1248 minimum_refs += 2; 1248 minimum_refs += 2;
1249 } 1249 }
1250 unlock_mount_hash(); 1250 unlock_mount_hash();
1251 1251
1252 if (actual_refs > minimum_refs) 1252 if (actual_refs > minimum_refs)
1253 return 0; 1253 return 0;
1254 1254
1255 return 1; 1255 return 1;
1256 } 1256 }
1257 1257
1258 EXPORT_SYMBOL(may_umount_tree); 1258 EXPORT_SYMBOL(may_umount_tree);
1259 1259
1260 /** 1260 /**
1261 * may_umount - check if a mount point is busy 1261 * may_umount - check if a mount point is busy
1262 * @mnt: root of mount 1262 * @mnt: root of mount
1263 * 1263 *
1264 * This is called to check if a mount point has any 1264 * This is called to check if a mount point has any
1265 * open files, pwds, chroots or sub mounts. If the 1265 * open files, pwds, chroots or sub mounts. If the
1266 * mount has sub mounts this will return busy 1266 * mount has sub mounts this will return busy
1267 * regardless of whether the sub mounts are busy. 1267 * regardless of whether the sub mounts are busy.
1268 * 1268 *
1269 * Doesn't take quota and stuff into account. IOW, in some cases it will 1269 * Doesn't take quota and stuff into account. IOW, in some cases it will
1270 * give false negatives. The main reason why it's here is that we need 1270 * give false negatives. The main reason why it's here is that we need
1271 * a non-destructive way to look for easily umountable filesystems. 1271 * a non-destructive way to look for easily umountable filesystems.
1272 */ 1272 */
1273 int may_umount(struct vfsmount *mnt) 1273 int may_umount(struct vfsmount *mnt)
1274 { 1274 {
1275 int ret = 1; 1275 int ret = 1;
1276 down_read(&namespace_sem); 1276 down_read(&namespace_sem);
1277 lock_mount_hash(); 1277 lock_mount_hash();
1278 if (propagate_mount_busy(real_mount(mnt), 2)) 1278 if (propagate_mount_busy(real_mount(mnt), 2))
1279 ret = 0; 1279 ret = 0;
1280 unlock_mount_hash(); 1280 unlock_mount_hash();
1281 up_read(&namespace_sem); 1281 up_read(&namespace_sem);
1282 return ret; 1282 return ret;
1283 } 1283 }
1284 1284
1285 EXPORT_SYMBOL(may_umount); 1285 EXPORT_SYMBOL(may_umount);
1286 1286
1287 static HLIST_HEAD(unmounted); /* protected by namespace_sem */ 1287 static HLIST_HEAD(unmounted); /* protected by namespace_sem */
1288 1288
1289 static void namespace_unlock(void) 1289 static void namespace_unlock(void)
1290 { 1290 {
1291 struct mount *mnt; 1291 struct mount *mnt;
1292 struct hlist_head head = unmounted; 1292 struct hlist_head head = unmounted;
1293 1293
1294 if (likely(hlist_empty(&head))) { 1294 if (likely(hlist_empty(&head))) {
1295 up_write(&namespace_sem); 1295 up_write(&namespace_sem);
1296 return; 1296 return;
1297 } 1297 }
1298 1298
1299 head.first->pprev = &head.first; 1299 head.first->pprev = &head.first;
1300 INIT_HLIST_HEAD(&unmounted); 1300 INIT_HLIST_HEAD(&unmounted);
1301 1301
1302 /* undo decrements we'd done in umount_tree() */ 1302 /* undo decrements we'd done in umount_tree() */
1303 hlist_for_each_entry(mnt, &head, mnt_hash) 1303 hlist_for_each_entry(mnt, &head, mnt_hash)
1304 if (mnt->mnt_ex_mountpoint.mnt) 1304 if (mnt->mnt_ex_mountpoint.mnt)
1305 mntget(mnt->mnt_ex_mountpoint.mnt); 1305 mntget(mnt->mnt_ex_mountpoint.mnt);
1306 1306
1307 up_write(&namespace_sem); 1307 up_write(&namespace_sem);
1308 1308
1309 synchronize_rcu(); 1309 synchronize_rcu();
1310 1310
1311 while (!hlist_empty(&head)) { 1311 while (!hlist_empty(&head)) {
1312 mnt = hlist_entry(head.first, struct mount, mnt_hash); 1312 mnt = hlist_entry(head.first, struct mount, mnt_hash);
1313 hlist_del_init(&mnt->mnt_hash); 1313 hlist_del_init(&mnt->mnt_hash);
1314 if (mnt->mnt_ex_mountpoint.mnt) 1314 if (mnt->mnt_ex_mountpoint.mnt)
1315 path_put(&mnt->mnt_ex_mountpoint); 1315 path_put(&mnt->mnt_ex_mountpoint);
1316 mntput(&mnt->mnt); 1316 mntput(&mnt->mnt);
1317 } 1317 }
1318 } 1318 }
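namespace_unlock() above moves the whole unmounted list into a local head by structure assignment, which is why the head.first->pprev fixup is needed: a plain copy leaves the first node's pprev aimed at the old global head, so the later hlist_del_init() would write through a stale pointer. A minimal sketch of the same move as a helper (hypothetical; <linux/list.h> provides no such primitive here):

#include <linux/list.h>

/* Hypothetical hlist_move_head(): transplant an entire hlist onto a
 * new head. Safe in this context only because namespace_sem excludes
 * concurrent writers while the move happens. */
static inline void hlist_move_head(struct hlist_head *old,
				   struct hlist_head *new)
{
	new->first = old->first;
	if (new->first)
		new->first->pprev = &new->first; /* re-aim pprev at the copy */
	INIT_HLIST_HEAD(old);
}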
1319 1319
1320 static inline void namespace_lock(void) 1320 static inline void namespace_lock(void)
1321 { 1321 {
1322 down_write(&namespace_sem); 1322 down_write(&namespace_sem);
1323 } 1323 }
1324 1324
1325 /* 1325 /*
1326 * mount_lock must be held 1326 * mount_lock must be held
1327 * namespace_sem must be held for write 1327 * namespace_sem must be held for write
1328 * how = 0 => just this tree, don't propagate 1328 * how = 0 => just this tree, don't propagate
1329  * how = 1 => propagate; we know that nobody else has a reference to any victims 1331  * how = 1 => propagate; we know that nobody else has a reference to any victims
1330 * how = 2 => lazy umount 1330 * how = 2 => lazy umount
1331 */ 1331 */
1332 void umount_tree(struct mount *mnt, int how) 1332 void umount_tree(struct mount *mnt, int how)
1333 { 1333 {
1334 HLIST_HEAD(tmp_list); 1334 HLIST_HEAD(tmp_list);
1335 struct mount *p; 1335 struct mount *p;
1336 struct mount *last = NULL; 1336 struct mount *last = NULL;
1337 1337
1338 for (p = mnt; p; p = next_mnt(p, mnt)) { 1338 for (p = mnt; p; p = next_mnt(p, mnt)) {
1339 hlist_del_init_rcu(&p->mnt_hash); 1339 hlist_del_init_rcu(&p->mnt_hash);
1340 hlist_add_head(&p->mnt_hash, &tmp_list); 1340 hlist_add_head(&p->mnt_hash, &tmp_list);
1341 } 1341 }
1342 1342
1343 hlist_for_each_entry(p, &tmp_list, mnt_hash) 1343 hlist_for_each_entry(p, &tmp_list, mnt_hash)
1344 list_del_init(&p->mnt_child); 1344 list_del_init(&p->mnt_child);
1345 1345
1346 if (how) 1346 if (how)
1347 propagate_umount(&tmp_list); 1347 propagate_umount(&tmp_list);
1348 1348
1349 hlist_for_each_entry(p, &tmp_list, mnt_hash) { 1349 hlist_for_each_entry(p, &tmp_list, mnt_hash) {
1350 list_del_init(&p->mnt_expire); 1350 list_del_init(&p->mnt_expire);
1351 list_del_init(&p->mnt_list); 1351 list_del_init(&p->mnt_list);
1352 __touch_mnt_namespace(p->mnt_ns); 1352 __touch_mnt_namespace(p->mnt_ns);
1353 p->mnt_ns = NULL; 1353 p->mnt_ns = NULL;
1354 if (how < 2) 1354 if (how < 2)
1355 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; 1355 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
1356 if (mnt_has_parent(p)) { 1356 if (mnt_has_parent(p)) {
1357 hlist_del_init(&p->mnt_mp_list); 1357 hlist_del_init(&p->mnt_mp_list);
1358 put_mountpoint(p->mnt_mp); 1358 put_mountpoint(p->mnt_mp);
1359 mnt_add_count(p->mnt_parent, -1); 1359 mnt_add_count(p->mnt_parent, -1);
1360 /* move the reference to mountpoint into ->mnt_ex_mountpoint */ 1360 /* move the reference to mountpoint into ->mnt_ex_mountpoint */
1361 p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint; 1361 p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint;
1362 p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt; 1362 p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt;
1363 p->mnt_mountpoint = p->mnt.mnt_root; 1363 p->mnt_mountpoint = p->mnt.mnt_root;
1364 p->mnt_parent = p; 1364 p->mnt_parent = p;
1365 p->mnt_mp = NULL; 1365 p->mnt_mp = NULL;
1366 } 1366 }
1367 change_mnt_propagation(p, MS_PRIVATE); 1367 change_mnt_propagation(p, MS_PRIVATE);
1368 last = p; 1368 last = p;
1369 } 1369 }
1370 if (last) { 1370 if (last) {
1371 last->mnt_hash.next = unmounted.first; 1371 last->mnt_hash.next = unmounted.first;
1372 if (unmounted.first)
1373 unmounted.first->pprev = &last->mnt_hash.next;
1372 unmounted.first = tmp_list.first; 1374 unmounted.first = tmp_list.first;
1373 unmounted.first->pprev = &unmounted.first; 1375 unmounted.first->pprev = &unmounted.first;
1374 } 1376 }
1375 } 1377 }
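The two lines added at the end of umount_tree() are the substance of this commit: when tmp_list is prepended to a non-empty unmounted list, the former first entry's pprev must be redirected to &last->mnt_hash.next, otherwise a later hlist_del_init() on that entry writes through the stale pprev and stomps unmounted.first. The open-coded splice could be expressed as a generic helper along these lines (hypothetical; hlists have no splice primitive at this point):

#include <linux/list.h>

/* Hypothetical hlist_splice(): prepend every entry of @from onto @to,
 * performing both pprev fixups that the open-coded version above does. */
static inline void hlist_splice(struct hlist_head *from,
				struct hlist_head *to)
{
	struct hlist_node *last = from->first;

	if (!last)
		return;
	while (last->next)
		last = last->next;
	last->next = to->first;
	if (to->first)
		to->first->pprev = &last->next;	/* the line this commit adds */
	to->first = from->first;
	to->first->pprev = &to->first;
	INIT_HLIST_HEAD(from);
}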
1376 1378
1377 static void shrink_submounts(struct mount *mnt); 1379 static void shrink_submounts(struct mount *mnt);
1378 1380
1379 static int do_umount(struct mount *mnt, int flags) 1381 static int do_umount(struct mount *mnt, int flags)
1380 { 1382 {
1381 struct super_block *sb = mnt->mnt.mnt_sb; 1383 struct super_block *sb = mnt->mnt.mnt_sb;
1382 int retval; 1384 int retval;
1383 1385
1384 retval = security_sb_umount(&mnt->mnt, flags); 1386 retval = security_sb_umount(&mnt->mnt, flags);
1385 if (retval) 1387 if (retval)
1386 return retval; 1388 return retval;
1387 1389
1388 /* 1390 /*
1389 * Allow userspace to request a mountpoint be expired rather than 1391 * Allow userspace to request a mountpoint be expired rather than
1390 * unmounting unconditionally. Unmount only happens if: 1392 * unmounting unconditionally. Unmount only happens if:
1391 * (1) the mark is already set (the mark is cleared by mntput()) 1393 * (1) the mark is already set (the mark is cleared by mntput())
1392 * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount] 1394 * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
1393 */ 1395 */
1394 if (flags & MNT_EXPIRE) { 1396 if (flags & MNT_EXPIRE) {
1395 if (&mnt->mnt == current->fs->root.mnt || 1397 if (&mnt->mnt == current->fs->root.mnt ||
1396 flags & (MNT_FORCE | MNT_DETACH)) 1398 flags & (MNT_FORCE | MNT_DETACH))
1397 return -EINVAL; 1399 return -EINVAL;
1398 1400
1399 /* 1401 /*
1400 * probably don't strictly need the lock here if we examined 1402 * probably don't strictly need the lock here if we examined
1401 * all race cases, but it's a slowpath. 1403 * all race cases, but it's a slowpath.
1402 */ 1404 */
1403 lock_mount_hash(); 1405 lock_mount_hash();
1404 if (mnt_get_count(mnt) != 2) { 1406 if (mnt_get_count(mnt) != 2) {
1405 unlock_mount_hash(); 1407 unlock_mount_hash();
1406 return -EBUSY; 1408 return -EBUSY;
1407 } 1409 }
1408 unlock_mount_hash(); 1410 unlock_mount_hash();
1409 1411
1410 if (!xchg(&mnt->mnt_expiry_mark, 1)) 1412 if (!xchg(&mnt->mnt_expiry_mark, 1))
1411 return -EAGAIN; 1413 return -EAGAIN;
1412 } 1414 }
1413 1415
1414 /* 1416 /*
1415 * If we may have to abort operations to get out of this 1417 * If we may have to abort operations to get out of this
1416 * mount, and they will themselves hold resources we must 1418 * mount, and they will themselves hold resources we must
1417 * allow the fs to do things. In the Unix tradition of 1419 * allow the fs to do things. In the Unix tradition of
1418  * 'Gee, that's tricky, let's do it in userspace' the umount_begin 1420  * 'Gee, that's tricky, let's do it in userspace' the umount_begin
1419 * might fail to complete on the first run through as other tasks 1421 * might fail to complete on the first run through as other tasks
1420  * must return, and the like. That's for the mount program to worry 1422  * must return, and the like. That's for the mount program to worry
1421 * about for the moment. 1423 * about for the moment.
1422 */ 1424 */
1423 1425
1424 if (flags & MNT_FORCE && sb->s_op->umount_begin) { 1426 if (flags & MNT_FORCE && sb->s_op->umount_begin) {
1425 sb->s_op->umount_begin(sb); 1427 sb->s_op->umount_begin(sb);
1426 } 1428 }
1427 1429
1428 /* 1430 /*
1429 * No sense to grab the lock for this test, but test itself looks 1431 * No sense to grab the lock for this test, but test itself looks
1430 * somewhat bogus. Suggestions for better replacement? 1432 * somewhat bogus. Suggestions for better replacement?
1431 * Ho-hum... In principle, we might treat that as umount + switch 1433 * Ho-hum... In principle, we might treat that as umount + switch
1432 * to rootfs. GC would eventually take care of the old vfsmount. 1434 * to rootfs. GC would eventually take care of the old vfsmount.
1433 * Actually it makes sense, especially if rootfs would contain a 1435 * Actually it makes sense, especially if rootfs would contain a
1434 * /reboot - static binary that would close all descriptors and 1436 * /reboot - static binary that would close all descriptors and
1435  * call reboot(2). Then init(8) could umount root and exec /reboot. 1437  * call reboot(2). Then init(8) could umount root and exec /reboot.
1436 */ 1438 */
1437 if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { 1439 if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
1438 /* 1440 /*
1439 * Special case for "unmounting" root ... 1441 * Special case for "unmounting" root ...
1440 * we just try to remount it readonly. 1442 * we just try to remount it readonly.
1441 */ 1443 */
1442 if (!capable(CAP_SYS_ADMIN)) 1444 if (!capable(CAP_SYS_ADMIN))
1443 return -EPERM; 1445 return -EPERM;
1444 down_write(&sb->s_umount); 1446 down_write(&sb->s_umount);
1445 if (!(sb->s_flags & MS_RDONLY)) 1447 if (!(sb->s_flags & MS_RDONLY))
1446 retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); 1448 retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
1447 up_write(&sb->s_umount); 1449 up_write(&sb->s_umount);
1448 return retval; 1450 return retval;
1449 } 1451 }
1450 1452
1451 namespace_lock(); 1453 namespace_lock();
1452 lock_mount_hash(); 1454 lock_mount_hash();
1453 event++; 1455 event++;
1454 1456
1455 if (flags & MNT_DETACH) { 1457 if (flags & MNT_DETACH) {
1456 if (!list_empty(&mnt->mnt_list)) 1458 if (!list_empty(&mnt->mnt_list))
1457 umount_tree(mnt, 2); 1459 umount_tree(mnt, 2);
1458 retval = 0; 1460 retval = 0;
1459 } else { 1461 } else {
1460 shrink_submounts(mnt); 1462 shrink_submounts(mnt);
1461 retval = -EBUSY; 1463 retval = -EBUSY;
1462 if (!propagate_mount_busy(mnt, 2)) { 1464 if (!propagate_mount_busy(mnt, 2)) {
1463 if (!list_empty(&mnt->mnt_list)) 1465 if (!list_empty(&mnt->mnt_list))
1464 umount_tree(mnt, 1); 1466 umount_tree(mnt, 1);
1465 retval = 0; 1467 retval = 0;
1466 } 1468 }
1467 } 1469 }
1468 unlock_mount_hash(); 1470 unlock_mount_hash();
1469 namespace_unlock(); 1471 namespace_unlock();
1470 return retval; 1472 return retval;
1471 } 1473 }
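The MNT_EXPIRE branch of do_umount() implements a two-pass protocol for userspace expiry daemons: the first umount2() call only arms the expiry mark and fails with EAGAIN, any intervening use of the mount clears the mark via mntput(), and a later call unmounts only if the mark survived untouched. A minimal userspace sketch under those semantics (the mountpoint path is a placeholder):

#include <sys/mount.h>
#include <errno.h>

/* Returns 0 if the mount was unmounted, 1 if the expiry mark was just
 * armed (retry on the next cycle), -1 on real errors such as EBUSY. */
static int expire_pass(const char *mountpoint)
{
	if (umount2(mountpoint, MNT_EXPIRE) == 0)
		return 0;	/* unused since the previous pass: gone now */
	if (errno == EAGAIN)
		return 1;	/* mark set; check again on the next cycle */
	return -1;
}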
1472 1474
1473 /* 1475 /*
1474 * __detach_mounts - lazily unmount all mounts on the specified dentry 1476 * __detach_mounts - lazily unmount all mounts on the specified dentry
1475 * 1477 *
1476  * During unlink, rmdir, and d_drop it is possible to lose the path 1478  * During unlink, rmdir, and d_drop it is possible to lose the path
1477 * to an existing mountpoint, and wind up leaking the mount. 1479 * to an existing mountpoint, and wind up leaking the mount.
1478 * detach_mounts allows lazily unmounting those mounts instead of 1480 * detach_mounts allows lazily unmounting those mounts instead of
1479 * leaking them. 1481 * leaking them.
1480 * 1482 *
1481 * The caller may hold dentry->d_inode->i_mutex. 1483 * The caller may hold dentry->d_inode->i_mutex.
1482 */ 1484 */
1483 void __detach_mounts(struct dentry *dentry) 1485 void __detach_mounts(struct dentry *dentry)
1484 { 1486 {
1485 struct mountpoint *mp; 1487 struct mountpoint *mp;
1486 struct mount *mnt; 1488 struct mount *mnt;
1487 1489
1488 namespace_lock(); 1490 namespace_lock();
1489 mp = lookup_mountpoint(dentry); 1491 mp = lookup_mountpoint(dentry);
1490 if (!mp) 1492 if (!mp)
1491 goto out_unlock; 1493 goto out_unlock;
1492 1494
1493 lock_mount_hash(); 1495 lock_mount_hash();
1494 while (!hlist_empty(&mp->m_list)) { 1496 while (!hlist_empty(&mp->m_list)) {
1495 mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); 1497 mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
1496 umount_tree(mnt, 2); 1498 umount_tree(mnt, 2);
1497 } 1499 }
1498 unlock_mount_hash(); 1500 unlock_mount_hash();
1499 put_mountpoint(mp); 1501 put_mountpoint(mp);
1500 out_unlock: 1502 out_unlock:
1501 namespace_unlock(); 1503 namespace_unlock();
1502 } 1504 }
1503 1505
1504 /* 1506 /*
1505 * Is the caller allowed to modify his namespace? 1507 * Is the caller allowed to modify his namespace?
1506 */ 1508 */
1507 static inline bool may_mount(void) 1509 static inline bool may_mount(void)
1508 { 1510 {
1509 return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); 1511 return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
1510 } 1512 }
1511 1513
1512 /* 1514 /*
1513 * Now umount can handle mount points as well as block devices. 1515 * Now umount can handle mount points as well as block devices.
1514 * This is important for filesystems which use unnamed block devices. 1516 * This is important for filesystems which use unnamed block devices.
1515 * 1517 *
1516 * We now support a flag for forced unmount like the other 'big iron' 1518 * We now support a flag for forced unmount like the other 'big iron'
1517 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD 1519 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
1518 */ 1520 */
1519 1521
1520 SYSCALL_DEFINE2(umount, char __user *, name, int, flags) 1522 SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
1521 { 1523 {
1522 struct path path; 1524 struct path path;
1523 struct mount *mnt; 1525 struct mount *mnt;
1524 int retval; 1526 int retval;
1525 int lookup_flags = 0; 1527 int lookup_flags = 0;
1526 1528
1527 if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) 1529 if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
1528 return -EINVAL; 1530 return -EINVAL;
1529 1531
1530 if (!may_mount()) 1532 if (!may_mount())
1531 return -EPERM; 1533 return -EPERM;
1532 1534
1533 if (!(flags & UMOUNT_NOFOLLOW)) 1535 if (!(flags & UMOUNT_NOFOLLOW))
1534 lookup_flags |= LOOKUP_FOLLOW; 1536 lookup_flags |= LOOKUP_FOLLOW;
1535 1537
1536 retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path); 1538 retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
1537 if (retval) 1539 if (retval)
1538 goto out; 1540 goto out;
1539 mnt = real_mount(path.mnt); 1541 mnt = real_mount(path.mnt);
1540 retval = -EINVAL; 1542 retval = -EINVAL;
1541 if (path.dentry != path.mnt->mnt_root) 1543 if (path.dentry != path.mnt->mnt_root)
1542 goto dput_and_out; 1544 goto dput_and_out;
1543 if (!check_mnt(mnt)) 1545 if (!check_mnt(mnt))
1544 goto dput_and_out; 1546 goto dput_and_out;
1545 if (mnt->mnt.mnt_flags & MNT_LOCKED) 1547 if (mnt->mnt.mnt_flags & MNT_LOCKED)
1546 goto dput_and_out; 1548 goto dput_and_out;
1547 1549
1548 retval = do_umount(mnt, flags); 1550 retval = do_umount(mnt, flags);
1549 dput_and_out: 1551 dput_and_out:
1550 /* we mustn't call path_put() as that would clear mnt_expiry_mark */ 1552 /* we mustn't call path_put() as that would clear mnt_expiry_mark */
1551 dput(path.dentry); 1553 dput(path.dentry);
1552 mntput_no_expire(mnt); 1554 mntput_no_expire(mnt);
1553 out: 1555 out:
1554 return retval; 1556 return retval;
1555 } 1557 }
1556 1558
1557 #ifdef __ARCH_WANT_SYS_OLDUMOUNT 1559 #ifdef __ARCH_WANT_SYS_OLDUMOUNT
1558 1560
1559 /* 1561 /*
1560 * The 2.0 compatible umount. No flags. 1562 * The 2.0 compatible umount. No flags.
1561 */ 1563 */
1562 SYSCALL_DEFINE1(oldumount, char __user *, name) 1564 SYSCALL_DEFINE1(oldumount, char __user *, name)
1563 { 1565 {
1564 return sys_umount(name, 0); 1566 return sys_umount(name, 0);
1565 } 1567 }
1566 1568
1567 #endif 1569 #endif
1568 1570
1569 static bool is_mnt_ns_file(struct dentry *dentry) 1571 static bool is_mnt_ns_file(struct dentry *dentry)
1570 { 1572 {
1571 /* Is this a proxy for a mount namespace? */ 1573 /* Is this a proxy for a mount namespace? */
1572 struct inode *inode = dentry->d_inode; 1574 struct inode *inode = dentry->d_inode;
1573 struct proc_ns *ei; 1575 struct proc_ns *ei;
1574 1576
1575 if (!proc_ns_inode(inode)) 1577 if (!proc_ns_inode(inode))
1576 return false; 1578 return false;
1577 1579
1578 ei = get_proc_ns(inode); 1580 ei = get_proc_ns(inode);
1579 if (ei->ns_ops != &mntns_operations) 1581 if (ei->ns_ops != &mntns_operations)
1580 return false; 1582 return false;
1581 1583
1582 return true; 1584 return true;
1583 } 1585 }
1584 1586
1585 static bool mnt_ns_loop(struct dentry *dentry) 1587 static bool mnt_ns_loop(struct dentry *dentry)
1586 { 1588 {
1587 /* Could bind mounting the mount namespace inode cause a 1589 /* Could bind mounting the mount namespace inode cause a
1588 * mount namespace loop? 1590 * mount namespace loop?
1589 */ 1591 */
1590 struct mnt_namespace *mnt_ns; 1592 struct mnt_namespace *mnt_ns;
1591 if (!is_mnt_ns_file(dentry)) 1593 if (!is_mnt_ns_file(dentry))
1592 return false; 1594 return false;
1593 1595
1594 mnt_ns = get_proc_ns(dentry->d_inode)->ns; 1596 mnt_ns = get_proc_ns(dentry->d_inode)->ns;
1595 return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; 1597 return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
1596 } 1598 }
1597 1599
1598 struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, 1600 struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1599 int flag) 1601 int flag)
1600 { 1602 {
1601 struct mount *res, *p, *q, *r, *parent; 1603 struct mount *res, *p, *q, *r, *parent;
1602 1604
1603 if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt)) 1605 if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
1604 return ERR_PTR(-EINVAL); 1606 return ERR_PTR(-EINVAL);
1605 1607
1606 if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry)) 1608 if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
1607 return ERR_PTR(-EINVAL); 1609 return ERR_PTR(-EINVAL);
1608 1610
1609 res = q = clone_mnt(mnt, dentry, flag); 1611 res = q = clone_mnt(mnt, dentry, flag);
1610 if (IS_ERR(q)) 1612 if (IS_ERR(q))
1611 return q; 1613 return q;
1612 1614
1613 q->mnt.mnt_flags &= ~MNT_LOCKED; 1615 q->mnt.mnt_flags &= ~MNT_LOCKED;
1614 q->mnt_mountpoint = mnt->mnt_mountpoint; 1616 q->mnt_mountpoint = mnt->mnt_mountpoint;
1615 1617
1616 p = mnt; 1618 p = mnt;
1617 list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) { 1619 list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
1618 struct mount *s; 1620 struct mount *s;
1619 if (!is_subdir(r->mnt_mountpoint, dentry)) 1621 if (!is_subdir(r->mnt_mountpoint, dentry))
1620 continue; 1622 continue;
1621 1623
1622 for (s = r; s; s = next_mnt(s, r)) { 1624 for (s = r; s; s = next_mnt(s, r)) {
1623 struct mount *t = NULL; 1625 struct mount *t = NULL;
1624 if (!(flag & CL_COPY_UNBINDABLE) && 1626 if (!(flag & CL_COPY_UNBINDABLE) &&
1625 IS_MNT_UNBINDABLE(s)) { 1627 IS_MNT_UNBINDABLE(s)) {
1626 s = skip_mnt_tree(s); 1628 s = skip_mnt_tree(s);
1627 continue; 1629 continue;
1628 } 1630 }
1629 if (!(flag & CL_COPY_MNT_NS_FILE) && 1631 if (!(flag & CL_COPY_MNT_NS_FILE) &&
1630 is_mnt_ns_file(s->mnt.mnt_root)) { 1632 is_mnt_ns_file(s->mnt.mnt_root)) {
1631 s = skip_mnt_tree(s); 1633 s = skip_mnt_tree(s);
1632 continue; 1634 continue;
1633 } 1635 }
1634 while (p != s->mnt_parent) { 1636 while (p != s->mnt_parent) {
1635 p = p->mnt_parent; 1637 p = p->mnt_parent;
1636 q = q->mnt_parent; 1638 q = q->mnt_parent;
1637 } 1639 }
1638 p = s; 1640 p = s;
1639 parent = q; 1641 parent = q;
1640 q = clone_mnt(p, p->mnt.mnt_root, flag); 1642 q = clone_mnt(p, p->mnt.mnt_root, flag);
1641 if (IS_ERR(q)) 1643 if (IS_ERR(q))
1642 goto out; 1644 goto out;
1643 lock_mount_hash(); 1645 lock_mount_hash();
1644 list_add_tail(&q->mnt_list, &res->mnt_list); 1646 list_add_tail(&q->mnt_list, &res->mnt_list);
1645 mnt_set_mountpoint(parent, p->mnt_mp, q); 1647 mnt_set_mountpoint(parent, p->mnt_mp, q);
1646 if (!list_empty(&parent->mnt_mounts)) { 1648 if (!list_empty(&parent->mnt_mounts)) {
1647 t = list_last_entry(&parent->mnt_mounts, 1649 t = list_last_entry(&parent->mnt_mounts,
1648 struct mount, mnt_child); 1650 struct mount, mnt_child);
1649 if (t->mnt_mp != p->mnt_mp) 1651 if (t->mnt_mp != p->mnt_mp)
1650 t = NULL; 1652 t = NULL;
1651 } 1653 }
1652 attach_shadowed(q, parent, t); 1654 attach_shadowed(q, parent, t);
1653 unlock_mount_hash(); 1655 unlock_mount_hash();
1654 } 1656 }
1655 } 1657 }
1656 return res; 1658 return res;
1657 out: 1659 out:
1658 if (res) { 1660 if (res) {
1659 lock_mount_hash(); 1661 lock_mount_hash();
1660 umount_tree(res, 0); 1662 umount_tree(res, 0);
1661 unlock_mount_hash(); 1663 unlock_mount_hash();
1662 } 1664 }
1663 return q; 1665 return q;
1664 } 1666 }
1665 1667
1666 /* Caller should check returned pointer for errors */ 1668 /* Caller should check returned pointer for errors */
1667 1669
1668 struct vfsmount *collect_mounts(struct path *path) 1670 struct vfsmount *collect_mounts(struct path *path)
1669 { 1671 {
1670 struct mount *tree; 1672 struct mount *tree;
1671 namespace_lock(); 1673 namespace_lock();
1672 tree = copy_tree(real_mount(path->mnt), path->dentry, 1674 tree = copy_tree(real_mount(path->mnt), path->dentry,
1673 CL_COPY_ALL | CL_PRIVATE); 1675 CL_COPY_ALL | CL_PRIVATE);
1674 namespace_unlock(); 1676 namespace_unlock();
1675 if (IS_ERR(tree)) 1677 if (IS_ERR(tree))
1676 return ERR_CAST(tree); 1678 return ERR_CAST(tree);
1677 return &tree->mnt; 1679 return &tree->mnt;
1678 } 1680 }
1679 1681
1680 void drop_collected_mounts(struct vfsmount *mnt) 1682 void drop_collected_mounts(struct vfsmount *mnt)
1681 { 1683 {
1682 namespace_lock(); 1684 namespace_lock();
1683 lock_mount_hash(); 1685 lock_mount_hash();
1684 umount_tree(real_mount(mnt), 0); 1686 umount_tree(real_mount(mnt), 0);
1685 unlock_mount_hash(); 1687 unlock_mount_hash();
1686 namespace_unlock(); 1688 namespace_unlock();
1687 } 1689 }
1688 1690
1689 /** 1691 /**
1690 * clone_private_mount - create a private clone of a path 1692 * clone_private_mount - create a private clone of a path
1691 * 1693 *
1692 * This creates a new vfsmount, which will be the clone of @path. The new will 1694 * This creates a new vfsmount, which will be the clone of @path. The new will
1693 * not be attached anywhere in the namespace and will be private (i.e. changes 1695 * not be attached anywhere in the namespace and will be private (i.e. changes
1694 * to the originating mount won't be propagated into this). 1696 * to the originating mount won't be propagated into this).
1695 * 1697 *
1696 * Release with mntput(). 1698 * Release with mntput().
1697 */ 1699 */
1698 struct vfsmount *clone_private_mount(struct path *path) 1700 struct vfsmount *clone_private_mount(struct path *path)
1699 { 1701 {
1700 struct mount *old_mnt = real_mount(path->mnt); 1702 struct mount *old_mnt = real_mount(path->mnt);
1701 struct mount *new_mnt; 1703 struct mount *new_mnt;
1702 1704
1703 if (IS_MNT_UNBINDABLE(old_mnt)) 1705 if (IS_MNT_UNBINDABLE(old_mnt))
1704 return ERR_PTR(-EINVAL); 1706 return ERR_PTR(-EINVAL);
1705 1707
1706 down_read(&namespace_sem); 1708 down_read(&namespace_sem);
1707 new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); 1709 new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
1708 up_read(&namespace_sem); 1710 up_read(&namespace_sem);
1709 if (IS_ERR(new_mnt)) 1711 if (IS_ERR(new_mnt))
1710 return ERR_CAST(new_mnt); 1712 return ERR_CAST(new_mnt);
1711 1713
1712 return &new_mnt->mnt; 1714 return &new_mnt->mnt;
1713 } 1715 }
1714 EXPORT_SYMBOL_GPL(clone_private_mount); 1716 EXPORT_SYMBOL_GPL(clone_private_mount);
1715 1717
1716 int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, 1718 int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
1717 struct vfsmount *root) 1719 struct vfsmount *root)
1718 { 1720 {
1719 struct mount *mnt; 1721 struct mount *mnt;
1720 int res = f(root, arg); 1722 int res = f(root, arg);
1721 if (res) 1723 if (res)
1722 return res; 1724 return res;
1723 list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) { 1725 list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
1724 res = f(&mnt->mnt, arg); 1726 res = f(&mnt->mnt, arg);
1725 if (res) 1727 if (res)
1726 return res; 1728 return res;
1727 } 1729 }
1728 return 0; 1730 return 0;
1729 } 1731 }
1730 1732
1731 static void cleanup_group_ids(struct mount *mnt, struct mount *end) 1733 static void cleanup_group_ids(struct mount *mnt, struct mount *end)
1732 { 1734 {
1733 struct mount *p; 1735 struct mount *p;
1734 1736
1735 for (p = mnt; p != end; p = next_mnt(p, mnt)) { 1737 for (p = mnt; p != end; p = next_mnt(p, mnt)) {
1736 if (p->mnt_group_id && !IS_MNT_SHARED(p)) 1738 if (p->mnt_group_id && !IS_MNT_SHARED(p))
1737 mnt_release_group_id(p); 1739 mnt_release_group_id(p);
1738 } 1740 }
1739 } 1741 }
1740 1742
1741 static int invent_group_ids(struct mount *mnt, bool recurse) 1743 static int invent_group_ids(struct mount *mnt, bool recurse)
1742 { 1744 {
1743 struct mount *p; 1745 struct mount *p;
1744 1746
1745 for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { 1747 for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
1746 if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { 1748 if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
1747 int err = mnt_alloc_group_id(p); 1749 int err = mnt_alloc_group_id(p);
1748 if (err) { 1750 if (err) {
1749 cleanup_group_ids(mnt, p); 1751 cleanup_group_ids(mnt, p);
1750 return err; 1752 return err;
1751 } 1753 }
1752 } 1754 }
1753 } 1755 }
1754 1756
1755 return 0; 1757 return 0;
1756 } 1758 }
1757 1759
1758 /* 1760 /*
1759 * @source_mnt : mount tree to be attached 1761 * @source_mnt : mount tree to be attached
1760 * @nd : place the mount tree @source_mnt is attached 1762 * @nd : place the mount tree @source_mnt is attached
1761 * @parent_nd : if non-null, detach the source_mnt from its parent and 1763 * @parent_nd : if non-null, detach the source_mnt from its parent and
1762 * store the parent mount and mountpoint dentry. 1764 * store the parent mount and mountpoint dentry.
1763 * (done when source_mnt is moved) 1765 * (done when source_mnt is moved)
1764 * 1766 *
1765  * NOTE: the table below explains the semantics when a source mount 1767  * NOTE: the table below explains the semantics when a source mount
1766 * of a given type is attached to a destination mount of a given type. 1768 * of a given type is attached to a destination mount of a given type.
1767 * --------------------------------------------------------------------------- 1769 * ---------------------------------------------------------------------------
1768 * | BIND MOUNT OPERATION | 1770 * | BIND MOUNT OPERATION |
1769 * |************************************************************************** 1771 * |**************************************************************************
1770 * | source-->| shared | private | slave | unbindable | 1772 * | source-->| shared | private | slave | unbindable |
1771 * | dest | | | | | 1773 * | dest | | | | |
1772 * | | | | | | | 1774 * | | | | | | |
1773 * | v | | | | | 1775 * | v | | | | |
1774 * |************************************************************************** 1776 * |**************************************************************************
1775 * | shared | shared (++) | shared (+) | shared(+++)| invalid | 1777 * | shared | shared (++) | shared (+) | shared(+++)| invalid |
1776 * | | | | | | 1778 * | | | | | |
1777 * |non-shared| shared (+) | private | slave (*) | invalid | 1779 * |non-shared| shared (+) | private | slave (*) | invalid |
1778 * *************************************************************************** 1780 * ***************************************************************************
1779 * A bind operation clones the source mount and mounts the clone on the 1781 * A bind operation clones the source mount and mounts the clone on the
1780 * destination mount. 1782 * destination mount.
1781 * 1783 *
1782 * (++) the cloned mount is propagated to all the mounts in the propagation 1784 * (++) the cloned mount is propagated to all the mounts in the propagation
1783 * tree of the destination mount and the cloned mount is added to 1785 * tree of the destination mount and the cloned mount is added to
1784 * the peer group of the source mount. 1786 * the peer group of the source mount.
1785 * (+) the cloned mount is created under the destination mount and is marked 1787 * (+) the cloned mount is created under the destination mount and is marked
1786 * as shared. The cloned mount is added to the peer group of the source 1788 * as shared. The cloned mount is added to the peer group of the source
1787 * mount. 1789 * mount.
1788 * (+++) the mount is propagated to all the mounts in the propagation tree 1790 * (+++) the mount is propagated to all the mounts in the propagation tree
1789 * of the destination mount and the cloned mount is made slave 1791 * of the destination mount and the cloned mount is made slave
1790 * of the same master as that of the source mount. The cloned mount 1792 * of the same master as that of the source mount. The cloned mount
1791 * is marked as 'shared and slave'. 1793 * is marked as 'shared and slave'.
1792 * (*) the cloned mount is made a slave of the same master as that of the 1794 * (*) the cloned mount is made a slave of the same master as that of the
1793 * source mount. 1795 * source mount.
1794 * 1796 *
1795 * --------------------------------------------------------------------------- 1797 * ---------------------------------------------------------------------------
1796 * | MOVE MOUNT OPERATION | 1798 * | MOVE MOUNT OPERATION |
1797 * |************************************************************************** 1799 * |**************************************************************************
1798 * | source-->| shared | private | slave | unbindable | 1800 * | source-->| shared | private | slave | unbindable |
1799 * | dest | | | | | 1801 * | dest | | | | |
1800 * | | | | | | | 1802 * | | | | | | |
1801 * | v | | | | | 1803 * | v | | | | |
1802 * |************************************************************************** 1804 * |**************************************************************************
1803 * | shared | shared (+) | shared (+) | shared(+++) | invalid | 1805 * | shared | shared (+) | shared (+) | shared(+++) | invalid |
1804 * | | | | | | 1806 * | | | | | |
1805 * |non-shared| shared (+*) | private | slave (*) | unbindable | 1807 * |non-shared| shared (+*) | private | slave (*) | unbindable |
1806 * *************************************************************************** 1808 * ***************************************************************************
1807 * 1809 *
1808 * (+) the mount is moved to the destination. And is then propagated to 1810 * (+) the mount is moved to the destination. And is then propagated to
1809 * all the mounts in the propagation tree of the destination mount. 1811 * all the mounts in the propagation tree of the destination mount.
1810 * (+*) the mount is moved to the destination. 1812 * (+*) the mount is moved to the destination.
1811 * (+++) the mount is moved to the destination and is then propagated to 1813 * (+++) the mount is moved to the destination and is then propagated to
1812 * all the mounts belonging to the destination mount's propagation tree. 1814 * all the mounts belonging to the destination mount's propagation tree.
1813 * the mount is marked as 'shared and slave'. 1815 * the mount is marked as 'shared and slave'.
1814 * (*) the mount continues to be a slave at the new location. 1816 * (*) the mount continues to be a slave at the new location.
1815 * 1817 *
1816  * if the source mount is a tree, the operations explained above are 1818  * if the source mount is a tree, the operations explained above are
1817 * applied to each mount in the tree. 1819 * applied to each mount in the tree.
1818 * Must be called without spinlocks held, since this function can sleep 1820 * Must be called without spinlocks held, since this function can sleep
1819 * in allocations. 1821 * in allocations.
1820 */ 1822 */
1821 static int attach_recursive_mnt(struct mount *source_mnt, 1823 static int attach_recursive_mnt(struct mount *source_mnt,
1822 struct mount *dest_mnt, 1824 struct mount *dest_mnt,
1823 struct mountpoint *dest_mp, 1825 struct mountpoint *dest_mp,
1824 struct path *parent_path) 1826 struct path *parent_path)
1825 { 1827 {
1826 HLIST_HEAD(tree_list); 1828 HLIST_HEAD(tree_list);
1827 struct mount *child, *p; 1829 struct mount *child, *p;
1828 struct hlist_node *n; 1830 struct hlist_node *n;
1829 int err; 1831 int err;
1830 1832
1831 if (IS_MNT_SHARED(dest_mnt)) { 1833 if (IS_MNT_SHARED(dest_mnt)) {
1832 err = invent_group_ids(source_mnt, true); 1834 err = invent_group_ids(source_mnt, true);
1833 if (err) 1835 if (err)
1834 goto out; 1836 goto out;
1835 err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); 1837 err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
1836 lock_mount_hash(); 1838 lock_mount_hash();
1837 if (err) 1839 if (err)
1838 goto out_cleanup_ids; 1840 goto out_cleanup_ids;
1839 for (p = source_mnt; p; p = next_mnt(p, source_mnt)) 1841 for (p = source_mnt; p; p = next_mnt(p, source_mnt))
1840 set_mnt_shared(p); 1842 set_mnt_shared(p);
1841 } else { 1843 } else {
1842 lock_mount_hash(); 1844 lock_mount_hash();
1843 } 1845 }
1844 if (parent_path) { 1846 if (parent_path) {
1845 detach_mnt(source_mnt, parent_path); 1847 detach_mnt(source_mnt, parent_path);
1846 attach_mnt(source_mnt, dest_mnt, dest_mp); 1848 attach_mnt(source_mnt, dest_mnt, dest_mp);
1847 touch_mnt_namespace(source_mnt->mnt_ns); 1849 touch_mnt_namespace(source_mnt->mnt_ns);
1848 } else { 1850 } else {
1849 mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); 1851 mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
1850 commit_tree(source_mnt, NULL); 1852 commit_tree(source_mnt, NULL);
1851 } 1853 }
1852 1854
1853 hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { 1855 hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
1854 struct mount *q; 1856 struct mount *q;
1855 hlist_del_init(&child->mnt_hash); 1857 hlist_del_init(&child->mnt_hash);
1856 q = __lookup_mnt_last(&child->mnt_parent->mnt, 1858 q = __lookup_mnt_last(&child->mnt_parent->mnt,
1857 child->mnt_mountpoint); 1859 child->mnt_mountpoint);
1858 commit_tree(child, q); 1860 commit_tree(child, q);
1859 } 1861 }
1860 unlock_mount_hash(); 1862 unlock_mount_hash();
1861 1863
1862 return 0; 1864 return 0;
1863 1865
1864 out_cleanup_ids: 1866 out_cleanup_ids:
1865 while (!hlist_empty(&tree_list)) { 1867 while (!hlist_empty(&tree_list)) {
1866 child = hlist_entry(tree_list.first, struct mount, mnt_hash); 1868 child = hlist_entry(tree_list.first, struct mount, mnt_hash);
1867 umount_tree(child, 0); 1869 umount_tree(child, 0);
1868 } 1870 }
1869 unlock_mount_hash(); 1871 unlock_mount_hash();
1870 cleanup_group_ids(source_mnt, NULL); 1872 cleanup_group_ids(source_mnt, NULL);
1871 out: 1873 out:
1872 return err; 1874 return err;
1873 } 1875 }
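As a concrete instance of the BIND MOUNT table above: binding from a shared source leaves the clone in the source's peer group, so later mounts propagate between the two trees. A hedged sketch using mount(2) directly (paths are placeholders, /mnt/a is assumed to already be a mount point, error handling elided):

#include <sys/mount.h>

static void bind_from_shared_source(void)
{
	mount(NULL, "/mnt/a", NULL, MS_SHARED, NULL);	/* source becomes shared */
	mount("/mnt/a", "/mnt/b", NULL, MS_BIND, NULL);	/* clone joins a's peer group */
	/* A mount created under /mnt/a now appears under /mnt/b as well;
	 * this is the shared-source column of the table above. */
	mount("none", "/mnt/a/sub", "tmpfs", 0, NULL);
}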
1874 1876
1875 static struct mountpoint *lock_mount(struct path *path) 1877 static struct mountpoint *lock_mount(struct path *path)
1876 { 1878 {
1877 struct vfsmount *mnt; 1879 struct vfsmount *mnt;
1878 struct dentry *dentry = path->dentry; 1880 struct dentry *dentry = path->dentry;
1879 retry: 1881 retry:
1880 mutex_lock(&dentry->d_inode->i_mutex); 1882 mutex_lock(&dentry->d_inode->i_mutex);
1881 if (unlikely(cant_mount(dentry))) { 1883 if (unlikely(cant_mount(dentry))) {
1882 mutex_unlock(&dentry->d_inode->i_mutex); 1884 mutex_unlock(&dentry->d_inode->i_mutex);
1883 return ERR_PTR(-ENOENT); 1885 return ERR_PTR(-ENOENT);
1884 } 1886 }
1885 namespace_lock(); 1887 namespace_lock();
1886 mnt = lookup_mnt(path); 1888 mnt = lookup_mnt(path);
1887 if (likely(!mnt)) { 1889 if (likely(!mnt)) {
1888 struct mountpoint *mp = lookup_mountpoint(dentry); 1890 struct mountpoint *mp = lookup_mountpoint(dentry);
1889 if (!mp) 1891 if (!mp)
1890 mp = new_mountpoint(dentry); 1892 mp = new_mountpoint(dentry);
1891 if (IS_ERR(mp)) { 1893 if (IS_ERR(mp)) {
1892 namespace_unlock(); 1894 namespace_unlock();
1893 mutex_unlock(&dentry->d_inode->i_mutex); 1895 mutex_unlock(&dentry->d_inode->i_mutex);
1894 return mp; 1896 return mp;
1895 } 1897 }
1896 return mp; 1898 return mp;
1897 } 1899 }
1898 namespace_unlock(); 1900 namespace_unlock();
1899 mutex_unlock(&path->dentry->d_inode->i_mutex); 1901 mutex_unlock(&path->dentry->d_inode->i_mutex);
1900 path_put(path); 1902 path_put(path);
1901 path->mnt = mnt; 1903 path->mnt = mnt;
1902 dentry = path->dentry = dget(mnt->mnt_root); 1904 dentry = path->dentry = dget(mnt->mnt_root);
1903 goto retry; 1905 goto retry;
1904 } 1906 }
1905 1907
1906 static void unlock_mount(struct mountpoint *where) 1908 static void unlock_mount(struct mountpoint *where)
1907 { 1909 {
1908 struct dentry *dentry = where->m_dentry; 1910 struct dentry *dentry = where->m_dentry;
1909 put_mountpoint(where); 1911 put_mountpoint(where);
1910 namespace_unlock(); 1912 namespace_unlock();
1911 mutex_unlock(&dentry->d_inode->i_mutex); 1913 mutex_unlock(&dentry->d_inode->i_mutex);
1912 } 1914 }
1913 1915
1914 static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) 1916 static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
1915 { 1917 {
1916 if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER) 1918 if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER)
1917 return -EINVAL; 1919 return -EINVAL;
1918 1920
1919 if (S_ISDIR(mp->m_dentry->d_inode->i_mode) != 1921 if (S_ISDIR(mp->m_dentry->d_inode->i_mode) !=
1920 S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode)) 1922 S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode))
1921 return -ENOTDIR; 1923 return -ENOTDIR;
1922 1924
1923 return attach_recursive_mnt(mnt, p, mp, NULL); 1925 return attach_recursive_mnt(mnt, p, mp, NULL);
1924 } 1926 }
1925 1927
1926 /* 1928 /*
1927 * Sanity check the flags to change_mnt_propagation. 1929 * Sanity check the flags to change_mnt_propagation.
1928 */ 1930 */
1929 1931
1930 static int flags_to_propagation_type(int flags) 1932 static int flags_to_propagation_type(int flags)
1931 { 1933 {
1932 int type = flags & ~(MS_REC | MS_SILENT); 1934 int type = flags & ~(MS_REC | MS_SILENT);
1933 1935
1934 /* Fail if any non-propagation flags are set */ 1936 /* Fail if any non-propagation flags are set */
1935 if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) 1937 if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
1936 return 0; 1938 return 0;
1937 /* Only one propagation flag should be set */ 1939 /* Only one propagation flag should be set */
1938 if (!is_power_of_2(type)) 1940 if (!is_power_of_2(type))
1939 return 0; 1941 return 0;
1940 return type; 1942 return type;
1941 } 1943 }
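flags_to_propagation_type() accepts exactly one of MS_SHARED, MS_PRIVATE, MS_SLAVE or MS_UNBINDABLE, optionally combined with MS_REC (MS_SILENT is masked off). From userspace this corresponds to a mount(2) call carrying only the propagation flag, for example (the mountpoint is a placeholder):

#include <sys/mount.h>

/* Rough equivalent of "mount --make-rshared <mountpoint>": source,
 * fstype and data are ignored for a pure propagation change. */
static int make_rshared(const char *mountpoint)
{
	return mount(NULL, mountpoint, NULL, MS_SHARED | MS_REC, NULL);
}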

/*
 * recursively change the type of the mountpoint.
 */
static int do_change_type(struct path *path, int flag)
{
	struct mount *m;
	struct mount *mnt = real_mount(path->mnt);
	int recurse = flag & MS_REC;
	int type;
	int err = 0;

	if (path->dentry != path->mnt->mnt_root)
		return -EINVAL;

	type = flags_to_propagation_type(flag);
	if (!type)
		return -EINVAL;

	namespace_lock();
	if (type == MS_SHARED) {
		err = invent_group_ids(mnt, recurse);
		if (err)
			goto out_unlock;
	}

	lock_mount_hash();
	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
		change_mnt_propagation(m, type);
	unlock_mount_hash();

out_unlock:
	namespace_unlock();
	return err;
}

static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
{
	struct mount *child;
	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
		if (!is_subdir(child->mnt_mountpoint, dentry))
			continue;

		if (child->mnt.mnt_flags & MNT_LOCKED)
			return true;
	}
	return false;
}

/*
 * do loopback mount.
 */
static int do_loopback(struct path *path, const char *old_name,
				int recurse)
{
	struct path old_path;
	struct mount *mnt = NULL, *old, *parent;
	struct mountpoint *mp;
	int err;
	if (!old_name || !*old_name)
		return -EINVAL;
	err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
	if (err)
		return err;

	err = -EINVAL;
	if (mnt_ns_loop(old_path.dentry))
		goto out;

	mp = lock_mount(path);
	err = PTR_ERR(mp);
	if (IS_ERR(mp))
		goto out;

	old = real_mount(old_path.mnt);
	parent = real_mount(path->mnt);

	err = -EINVAL;
	if (IS_MNT_UNBINDABLE(old))
		goto out2;

	if (!check_mnt(parent) || !check_mnt(old))
		goto out2;

	if (!recurse && has_locked_children(old, old_path.dentry))
		goto out2;

	if (recurse)
		mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
	else
		mnt = clone_mnt(old, old_path.dentry, 0);

	if (IS_ERR(mnt)) {
		err = PTR_ERR(mnt);
		goto out2;
	}

	mnt->mnt.mnt_flags &= ~MNT_LOCKED;

	err = graft_tree(mnt, parent, mp);
	if (err) {
		lock_mount_hash();
		umount_tree(mnt, 0);
		unlock_mount_hash();
	}
out2:
	unlock_mount(mp);
out:
	path_put(&old_path);
	return err;
}
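/*
 * Illustrative sketch (editor's addition, not from this file): do_loopback()
 * is what services MS_BIND from userspace. The two branches above map to a
 * plain bind (clone_mnt()) and a recursive bind (copy_tree()). The paths
 * are placeholders.
 */
#if 0	/* example only; not part of namespace.c */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Non-recursive bind: submounts under /src are not carried over. */
	if (mount("/src", "/dst", NULL, MS_BIND, NULL) != 0)
		perror("bind");

	/* Recursive bind ("rbind"): the whole tree under /src is cloned. */
	if (mount("/src", "/dst", NULL, MS_BIND | MS_REC, NULL) != 0)
		perror("rbind");
	return 0;
}
#endif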

static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
{
	int error = 0;
	int readonly_request = 0;

	if (ms_flags & MS_RDONLY)
		readonly_request = 1;
	if (readonly_request == __mnt_is_readonly(mnt))
		return 0;

	if (readonly_request)
		error = mnt_make_readonly(real_mount(mnt));
	else
		__mnt_unmake_readonly(real_mount(mnt));
	return error;
}

/*
 * change filesystem flags. dir should be a physical root of filesystem.
 * If you've mounted a non-root directory somewhere and want to do remount
 * on it - tough luck.
 */
static int do_remount(struct path *path, int flags, int mnt_flags,
		      void *data)
{
	int err;
	struct super_block *sb = path->mnt->mnt_sb;
	struct mount *mnt = real_mount(path->mnt);

	if (!check_mnt(mnt))
		return -EINVAL;

	if (path->dentry != path->mnt->mnt_root)
		return -EINVAL;

	/* Don't allow changing of locked mnt flags.
	 *
	 * No locks need to be held here while testing the various
	 * MNT_LOCK flags because those flags can never be cleared
	 * once they are set.
	 */
	if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
	    !(mnt_flags & MNT_READONLY)) {
		return -EPERM;
	}
	if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
	    !(mnt_flags & MNT_NODEV)) {
		return -EPERM;
	}
	if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
	    !(mnt_flags & MNT_NOSUID)) {
		return -EPERM;
	}
	if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) &&
	    !(mnt_flags & MNT_NOEXEC)) {
		return -EPERM;
	}
	if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
	    ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) {
		return -EPERM;
	}

	err = security_sb_remount(sb, data);
	if (err)
		return err;

	down_write(&sb->s_umount);
	if (flags & MS_BIND)
		err = change_mount_flags(path->mnt, flags);
	else if (!capable(CAP_SYS_ADMIN))
		err = -EPERM;
	else
		err = do_remount_sb(sb, flags, data, 0);
	if (!err) {
		lock_mount_hash();
		mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
		mnt->mnt.mnt_flags = mnt_flags;
		touch_mnt_namespace(mnt->mnt_ns);
		unlock_mount_hash();
	}
	up_write(&sb->s_umount);
	return err;
}
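/*
 * Illustrative sketch (editor's addition, not from this file): with
 * MS_REMOUNT|MS_BIND only the per-mountpoint flags change, via
 * change_mount_flags(); without MS_BIND the request also reaches the
 * superblock through do_remount_sb(). If a MNT_LOCK_* flag was set (for
 * instance by a more privileged mount namespace), trying to clear the
 * matching flag fails with EPERM, per the checks above. "/mnt" is a
 * placeholder for an existing mount point.
 */
#if 0	/* example only; not part of namespace.c */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Flip a mount read-only without touching superblock options. */
	if (mount(NULL, "/mnt", NULL, MS_REMOUNT | MS_BIND | MS_RDONLY, NULL))
		perror("remount ro");
	return 0;
}
#endif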

static inline int tree_contains_unbindable(struct mount *mnt)
{
	struct mount *p;
	for (p = mnt; p; p = next_mnt(p, mnt)) {
		if (IS_MNT_UNBINDABLE(p))
			return 1;
	}
	return 0;
}

static int do_move_mount(struct path *path, const char *old_name)
{
	struct path old_path, parent_path;
	struct mount *p;
	struct mount *old;
	struct mountpoint *mp;
	int err;
	if (!old_name || !*old_name)
		return -EINVAL;
	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
	if (err)
		return err;

	mp = lock_mount(path);
	err = PTR_ERR(mp);
	if (IS_ERR(mp))
		goto out;

	old = real_mount(old_path.mnt);
	p = real_mount(path->mnt);

	err = -EINVAL;
	if (!check_mnt(p) || !check_mnt(old))
		goto out1;

	if (old->mnt.mnt_flags & MNT_LOCKED)
		goto out1;

	err = -EINVAL;
	if (old_path.dentry != old_path.mnt->mnt_root)
		goto out1;

	if (!mnt_has_parent(old))
		goto out1;

	if (S_ISDIR(path->dentry->d_inode->i_mode) !=
	      S_ISDIR(old_path.dentry->d_inode->i_mode))
		goto out1;
	/*
	 * Don't move a mount residing in a shared parent.
	 */
	if (IS_MNT_SHARED(old->mnt_parent))
		goto out1;
	/*
	 * Don't move a mount tree containing unbindable mounts to a destination
	 * mount which is shared.
	 */
	if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
		goto out1;
	err = -ELOOP;
	for (; mnt_has_parent(p); p = p->mnt_parent)
		if (p == old)
			goto out1;

	err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path);
	if (err)
		goto out1;

	/* if the mount is moved, it should no longer be expired
	 * automatically */
	list_del_init(&old->mnt_expire);
out1:
	unlock_mount(mp);
out:
	if (!err)
		path_put(&parent_path);
	path_put(&old_path);
	return err;
}
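/*
 * Illustrative sketch (editor's addition, not from this file): MS_MOVE
 * detaches an existing mount and reattaches it elsewhere; the -ELOOP walk
 * above is what prevents moving a mount underneath itself. The paths are
 * placeholders, and /olddir is assumed to be the root of a mount.
 */
#if 0	/* example only; not part of namespace.c */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Equivalent of "mount --move /olddir /newdir". */
	if (mount("/olddir", "/newdir", NULL, MS_MOVE, NULL) != 0)
		perror("move");	/* EINVAL if /olddir sits in a shared parent */
	return 0;
}
#endif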

static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
{
	int err;
	const char *subtype = strchr(fstype, '.');
	if (subtype) {
		subtype++;
		err = -EINVAL;
		if (!subtype[0])
			goto err;
	} else
		subtype = "";

	mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
	err = -ENOMEM;
	if (!mnt->mnt_sb->s_subtype)
		goto err;
	return mnt;

err:
	mntput(mnt);
	return ERR_PTR(err);
}

/*
 * add a mount into a namespace's mount tree
 */
static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
{
	struct mountpoint *mp;
	struct mount *parent;
	int err;

	mnt_flags &= ~MNT_INTERNAL_FLAGS;

	mp = lock_mount(path);
	if (IS_ERR(mp))
		return PTR_ERR(mp);

	parent = real_mount(path->mnt);
	err = -EINVAL;
	if (unlikely(!check_mnt(parent))) {
		/* that's acceptable only for automounts done in private ns */
		if (!(mnt_flags & MNT_SHRINKABLE))
			goto unlock;
		/* ... and for those we'd better have mountpoint still alive */
		if (!parent->mnt_ns)
			goto unlock;
	}

	/* Refuse the same filesystem on the same mount point */
	err = -EBUSY;
	if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
	    path->mnt->mnt_root == path->dentry)
		goto unlock;

	err = -EINVAL;
	if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode))
		goto unlock;

	newmnt->mnt.mnt_flags = mnt_flags;
	err = graft_tree(newmnt, parent, mp);

unlock:
	unlock_mount(mp);
	return err;
}
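/*
 * Illustrative sketch (editor's addition, not from this file): the -EBUSY
 * check above rejects only the *same* filesystem instance stacked on the
 * same mount point; a second, distinct instance still mounts fine. tmpfs
 * creates a fresh superblock per mount, while proc reuses one superblock
 * per pid namespace, so the proc case below is assumed to fail with EBUSY
 * on a system where /proc is already mounted.
 */
#if 0	/* example only; not part of namespace.c */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Two distinct tmpfs superblocks stack on top of each other. */
	mount("tmpfs", "/mnt", "tmpfs", 0, NULL);
	mount("tmpfs", "/mnt", "tmpfs", 0, NULL);

	/* Same proc superblock onto its own mount point: EBUSY. */
	if (mount("proc", "/proc", "proc", 0, NULL) != 0)
		perror("proc");	/* expected: Device or resource busy */
	return 0;
}
#endif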

/*
 * create a new mount for userspace and request it to be added into the
 * namespace's tree
 */
static int do_new_mount(struct path *path, const char *fstype, int flags,
			int mnt_flags, const char *name, void *data)
{
	struct file_system_type *type;
	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
	struct vfsmount *mnt;
	int err;

	if (!fstype)
		return -EINVAL;

	type = get_fs_type(fstype);
	if (!type)
		return -ENODEV;

	if (user_ns != &init_user_ns) {
		if (!(type->fs_flags & FS_USERNS_MOUNT)) {
			put_filesystem(type);
			return -EPERM;
		}
		/* Only in special cases allow devices from mounts
		 * created outside the initial user namespace.
		 */
		if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
			flags |= MS_NODEV;
			mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
		}
	}

	mnt = vfs_kern_mount(type, flags, name, data);
	if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
	    !mnt->mnt_sb->s_subtype)
		mnt = fs_set_subtype(mnt, fstype);

	put_filesystem(type);
	if (IS_ERR(mnt))
		return PTR_ERR(mnt);

	err = do_add_mount(real_mount(mnt), path, mnt_flags);
	if (err)
		mntput(mnt);
	return err;
}

int finish_automount(struct vfsmount *m, struct path *path)
{
	struct mount *mnt = real_mount(m);
	int err;
	/* The new mount record should have at least 2 refs to prevent it being
	 * expired before we get a chance to add it
	 */
	BUG_ON(mnt_get_count(mnt) < 2);

	if (m->mnt_sb == path->mnt->mnt_sb &&
	    m->mnt_root == path->dentry) {
		err = -ELOOP;
		goto fail;
	}

	err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
	if (!err)
		return 0;
fail:
	/* remove m from any expiration list it may be on */
	if (!list_empty(&mnt->mnt_expire)) {
		namespace_lock();
		list_del_init(&mnt->mnt_expire);
		namespace_unlock();
	}
	mntput(m);
	mntput(m);
	return err;
}

/**
 * mnt_set_expiry - Put a mount on an expiration list
 * @mnt: The mount to list.
 * @expiry_list: The list to add the mount to.
 */
void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
{
	namespace_lock();

	list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);

	namespace_unlock();
}
EXPORT_SYMBOL(mnt_set_expiry);

/*
 * process a list of expirable mountpoints with the intent of discarding any
 * mountpoints that aren't in use and haven't been touched since last we came
 * here
 */
void mark_mounts_for_expiry(struct list_head *mounts)
{
	struct mount *mnt, *next;
	LIST_HEAD(graveyard);

	if (list_empty(mounts))
		return;

	namespace_lock();
	lock_mount_hash();

	/* extract from the expiration list every vfsmount that matches the
	 * following criteria:
	 * - only referenced by its parent vfsmount
	 * - still marked for expiry (marked on the last call here; marks are
	 *   cleared by mntput())
	 */
	list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
		if (!xchg(&mnt->mnt_expiry_mark, 1) ||
			propagate_mount_busy(mnt, 1))
			continue;
		list_move(&mnt->mnt_expire, &graveyard);
	}
	while (!list_empty(&graveyard)) {
		mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
		touch_mnt_namespace(mnt->mnt_ns);
		umount_tree(mnt, 1);
	}
	unlock_mount_hash();
	namespace_unlock();
}

EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
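/*
 * Illustrative sketch (editor's addition, not from this file): how a
 * filesystem typically drives this pair. Automounted vfsmounts go on a
 * private list via mnt_set_expiry(), and a periodic worker calls
 * mark_mounts_for_expiry(); a mount must sit unused across two sweeps
 * before it is unmounted. All names below (example_automount_list,
 * example_expiry_work, example_build_submount) are hypothetical.
 */
#if 0	/* example only; not part of namespace.c */
static LIST_HEAD(example_automount_list);
static void example_expiry_work(struct work_struct *work);
static DECLARE_DELAYED_WORK(example_expiry, example_expiry_work);

static void example_expiry_work(struct work_struct *work)
{
	/* First sweep sets mnt_expiry_mark; a mount still unused by the
	 * next sweep gets umount_tree()'d. */
	mark_mounts_for_expiry(&example_automount_list);
	schedule_delayed_work(&example_expiry, 10 * HZ);	/* re-arm */
}

static struct vfsmount *example_d_automount(struct path *path)
{
	struct vfsmount *mnt = example_build_submount(path);	/* hypothetical */

	if (!IS_ERR(mnt))
		mnt_set_expiry(mnt, &example_automount_list);
	return mnt;
}
#endif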

/*
 * Ripoff of 'select_parent()'
 *
 * search the list of submounts for a given mountpoint, and move any
 * shrinkable submounts to the 'graveyard' list.
 */
static int select_submounts(struct mount *parent, struct list_head *graveyard)
{
	struct mount *this_parent = parent;
	struct list_head *next;
	int found = 0;

repeat:
	next = this_parent->mnt_mounts.next;
resume:
	while (next != &this_parent->mnt_mounts) {
		struct list_head *tmp = next;
		struct mount *mnt = list_entry(tmp, struct mount, mnt_child);

		next = tmp->next;
		if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
			continue;
		/*
		 * Descend a level if the mnt_mounts list is non-empty.
		 */
		if (!list_empty(&mnt->mnt_mounts)) {
			this_parent = mnt;
			goto repeat;
		}

		if (!propagate_mount_busy(mnt, 1)) {
			list_move_tail(&mnt->mnt_expire, graveyard);
			found++;
		}
	}
	/*
	 * All done at this level ... ascend and resume the search
	 */
	if (this_parent != parent) {
		next = this_parent->mnt_child.next;
		this_parent = this_parent->mnt_parent;
		goto resume;
	}
	return found;
}

/*
 * process a list of expirable mountpoints with the intent of discarding any
 * submounts of a specific parent mountpoint
 *
 * mount_lock must be held for write
 */
static void shrink_submounts(struct mount *mnt)
{
	LIST_HEAD(graveyard);
	struct mount *m;

	/* extract submounts of 'mountpoint' from the expiration list */
	while (select_submounts(mnt, &graveyard)) {
		while (!list_empty(&graveyard)) {
			m = list_first_entry(&graveyard, struct mount,
						mnt_expire);
			touch_mnt_namespace(m->mnt_ns);
			umount_tree(m, 1);
		}
	}
}

/*
 * Some copy_from_user() implementations do not return the exact number of
 * bytes remaining to copy on a fault. But copy_mount_options() requires that.
 * Note that this function differs from copy_from_user() in that it will oops
 * on bad values of `to', rather than returning a short copy.
 */
static long exact_copy_from_user(void *to, const void __user * from,
				 unsigned long n)
{
	char *t = to;
	const char __user *f = from;
	char c;

	if (!access_ok(VERIFY_READ, from, n))
		return n;

	while (n) {
		if (__get_user(c, f)) {
			memset(t, 0, n);
			break;
		}
		*t++ = c;
		f++;
		n--;
	}
	return n;
}

int copy_mount_options(const void __user * data, unsigned long *where)
{
	int i;
	unsigned long page;
	unsigned long size;

	*where = 0;
	if (!data)
		return 0;

	if (!(page = __get_free_page(GFP_KERNEL)))
		return -ENOMEM;

	/* We only care that *some* data at the address the user
	 * gave us is valid. Just in case, we'll zero
	 * the remainder of the page.
	 */
	/* copy_from_user cannot cross TASK_SIZE ! */
	size = TASK_SIZE - (unsigned long)data;
	if (size > PAGE_SIZE)
		size = PAGE_SIZE;

	i = size - exact_copy_from_user((void *)page, data, size);
	if (!i) {
		free_page(page);
		return -EFAULT;
	}
	if (i != PAGE_SIZE)
		memset((char *)page + i, 0, PAGE_SIZE - i);
	*where = page;
	return 0;
}

char *copy_mount_string(const void __user *data)
{
	return data ? strndup_user(data, PAGE_SIZE) : NULL;
}

/*
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 */
long do_mount(const char *dev_name, const char __user *dir_name,
	      const char *type_page, unsigned long flags, void *data_page)
{
	struct path path;
	int retval = 0;
	int mnt_flags = 0;

	/* Discard magic */
	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
		flags &= ~MS_MGC_MSK;

	/* Basic sanity checks */
	if (data_page)
		((char *)data_page)[PAGE_SIZE - 1] = 0;

	/* ... and get the mountpoint */
	retval = user_path(dir_name, &path);
	if (retval)
		return retval;

	retval = security_sb_mount(dev_name, &path,
				   type_page, flags, data_page);
	if (!retval && !may_mount())
		retval = -EPERM;
	if (retval)
		goto dput_out;

	/* Default to relatime unless overridden */
	if (!(flags & MS_NOATIME))
		mnt_flags |= MNT_RELATIME;

	/* Separate the per-mountpoint flags */
	if (flags & MS_NOSUID)
		mnt_flags |= MNT_NOSUID;
	if (flags & MS_NODEV)
		mnt_flags |= MNT_NODEV;
	if (flags & MS_NOEXEC)
		mnt_flags |= MNT_NOEXEC;
	if (flags & MS_NOATIME)
		mnt_flags |= MNT_NOATIME;
	if (flags & MS_NODIRATIME)
		mnt_flags |= MNT_NODIRATIME;
	if (flags & MS_STRICTATIME)
		mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
	if (flags & MS_RDONLY)
		mnt_flags |= MNT_READONLY;

	/* The default atime for remount is preservation */
	if ((flags & MS_REMOUNT) &&
	    ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
		       MS_STRICTATIME)) == 0)) {
		mnt_flags &= ~MNT_ATIME_MASK;
		mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK;
	}

	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME | MS_KERNMOUNT |
		   MS_STRICTATIME);

	if (flags & MS_REMOUNT)
		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
				    data_page);
	else if (flags & MS_BIND)
		retval = do_loopback(&path, dev_name, flags & MS_REC);
	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
		retval = do_change_type(&path, flags);
	else if (flags & MS_MOVE)
		retval = do_move_mount(&path, dev_name);
	else
		retval = do_new_mount(&path, type_page, flags, mnt_flags,
				      dev_name, data_page);
dput_out:
	path_put(&path);
	return retval;
}
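/*
 * Illustrative sketch (editor's addition, not from this file): the dispatch
 * above, seen from userspace. One mount(2) call, where "flags" picks the
 * branch and "data" is the fs-specific option string copied in by
 * copy_mount_options(). Device and options below are placeholders.
 */
#if 0	/* example only; not part of namespace.c */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* No REMOUNT/BIND/MOVE/propagation flag is set, so this takes the
	 * do_new_mount() branch; "errors=remount-ro" lands in data_page. */
	if (mount("/dev/sda1", "/mnt", "ext4", MS_NOATIME,
		  "errors=remount-ro") != 0)
		perror("mount");
	return 0;
}
#endif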

static void free_mnt_ns(struct mnt_namespace *ns)
{
	proc_free_inum(ns->proc_inum);
	put_user_ns(ns->user_ns);
	kfree(ns);
}

/*
 * Assign a sequence number so we can detect when we attempt to bind
 * mount a reference to an older mount namespace into the current
 * mount namespace, preventing reference counting loops. A 64bit
 * number incrementing at 10GHz will take 12,427 years to wrap which
 * is effectively never, so we can ignore the possibility.
 */
static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);

static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
{
	struct mnt_namespace *new_ns;
	int ret;

	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
	if (!new_ns)
		return ERR_PTR(-ENOMEM);
	ret = proc_alloc_inum(&new_ns->proc_inum);
	if (ret) {
		kfree(new_ns);
		return ERR_PTR(ret);
	}
	new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
	atomic_set(&new_ns->count, 1);
	new_ns->root = NULL;
	INIT_LIST_HEAD(&new_ns->list);
	init_waitqueue_head(&new_ns->poll);
	new_ns->event = 0;
	new_ns->user_ns = get_user_ns(user_ns);
	return new_ns;
}

struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
		struct user_namespace *user_ns, struct fs_struct *new_fs)
{
	struct mnt_namespace *new_ns;
	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
	struct mount *p, *q;
	struct mount *old;
	struct mount *new;
	int copy_flags;

	BUG_ON(!ns);

	if (likely(!(flags & CLONE_NEWNS))) {
		get_mnt_ns(ns);
		return ns;
	}

	old = ns->root;

	new_ns = alloc_mnt_ns(user_ns);
	if (IS_ERR(new_ns))
		return new_ns;

	namespace_lock();
	/* First pass: copy the tree topology */
	copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
	if (user_ns != ns->user_ns)
		copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
	if (IS_ERR(new)) {
		namespace_unlock();
		free_mnt_ns(new_ns);
		return ERR_CAST(new);
	}
	new_ns->root = new;
	list_add_tail(&new_ns->list, &new->mnt_list);

	/*
	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
	 * as belonging to new namespace. We have already acquired a private
	 * fs_struct, so tsk->fs->lock is not needed.
	 */
	p = old;
	q = new;
	while (p) {
		q->mnt_ns = new_ns;
		if (new_fs) {
			if (&p->mnt == new_fs->root.mnt) {
				new_fs->root.mnt = mntget(&q->mnt);
				rootmnt = &p->mnt;
			}
			if (&p->mnt == new_fs->pwd.mnt) {
				new_fs->pwd.mnt = mntget(&q->mnt);
				pwdmnt = &p->mnt;
			}
		}
		p = next_mnt(p, old);
		q = next_mnt(q, new);
		if (!q)
			break;
		while (p->mnt.mnt_root != q->mnt.mnt_root)
			p = next_mnt(p, old);
	}
	namespace_unlock();

	if (rootmnt)
		mntput(rootmnt);
	if (pwdmnt)
		mntput(pwdmnt);

	return new_ns;
}
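/*
 * Illustrative sketch (editor's addition, not from this file): copy_mnt_ns()
 * runs on the CLONE_NEWNS path of clone(2)/unshare(2). After the unshare
 * below, mount changes stay private to this process, assuming propagation
 * is switched off first (systems that boot with / shared would otherwise
 * propagate changes back).
 */
#if 0	/* example only; not part of namespace.c */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (unshare(CLONE_NEWNS) != 0) {	/* triggers copy_mnt_ns() */
		perror("unshare");
		return 1;
	}
	/* Undo shared propagation so our mounts don't leak back out. */
	mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL);
	/* This tmpfs is now visible only in the new namespace. */
	mount("tmpfs", "/mnt", "tmpfs", 0, NULL);
	return 0;
}
#endif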

/**
 * create_mnt_ns - creates a private namespace and adds a root filesystem
 * @mnt: pointer to the new root filesystem mountpoint
 */
static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
{
	struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns);
	if (!IS_ERR(new_ns)) {
		struct mount *mnt = real_mount(m);
		mnt->mnt_ns = new_ns;
		new_ns->root = mnt;
		list_add(&mnt->mnt_list, &new_ns->list);
	} else {
		mntput(m);
	}
	return new_ns;
}

struct dentry *mount_subtree(struct vfsmount *mnt, const char *name)
{
	struct mnt_namespace *ns;
	struct super_block *s;
	struct path path;
	int err;

	ns = create_mnt_ns(mnt);
	if (IS_ERR(ns))
		return ERR_CAST(ns);

	err = vfs_path_lookup(mnt->mnt_root, mnt,
			name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);

	put_mnt_ns(ns);

	if (err)
		return ERR_PTR(err);

	/* trade a vfsmount reference for active sb one */
	s = path.mnt->mnt_sb;
	atomic_inc(&s->s_active);
	mntput(path.mnt);
	/* lock the sucker */
	down_write(&s->s_umount);
	/* ... and return the root of (sub)tree on it */
	return path.dentry;
}
EXPORT_SYMBOL(mount_subtree);

SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
		char __user *, type, unsigned long, flags, void __user *, data)
{
	int ret;
	char *kernel_type;
	char *kernel_dev;
	unsigned long data_page;

	kernel_type = copy_mount_string(type);
	ret = PTR_ERR(kernel_type);
	if (IS_ERR(kernel_type))
		goto out_type;

	kernel_dev = copy_mount_string(dev_name);
	ret = PTR_ERR(kernel_dev);
	if (IS_ERR(kernel_dev))
		goto out_dev;

	ret = copy_mount_options(data, &data_page);
	if (ret < 0)
		goto out_data;

	ret = do_mount(kernel_dev, dir_name, kernel_type, flags,
		       (void *) data_page);

	free_page(data_page);
out_data:
	kfree(kernel_dev);
out_dev:
	kfree(kernel_type);
out_type:
	return ret;
}

/*
 * Return true if path is reachable from root
 *
 * namespace_sem or mount_lock is held
 */
bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
			 const struct path *root)
{
	while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
		dentry = mnt->mnt_mountpoint;
		mnt = mnt->mnt_parent;
	}
	return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
}

int path_is_under(struct path *path1, struct path *path2)
{
	int res;
	read_seqlock_excl(&mount_lock);
	res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
	read_sequnlock_excl(&mount_lock);
	return res;
}
EXPORT_SYMBOL(path_is_under);

/*
 * pivot_root Semantics:
 * Moves the root file system of the current process to the directory put_old,
 * makes new_root as the new root file system of the current process, and sets
 * root/cwd of all processes which had them on the current root to new_root.
 *
 * Restrictions:
 * The new_root and put_old must be directories, and must not be on the
 * same file system as the current process root. The put_old must be
 * underneath new_root, i.e. adding a non-zero number of /.. to the string
 * pointed to by put_old must yield the same directory as new_root. No other
 * file system may be mounted on put_old. After all, new_root is a mountpoint.
 *
 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
 * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
 * in this situation.
 *
 * Notes:
 *  - we don't move root/cwd if they are not at the root (reason: if something
 *    cared enough to change them, it's probably wrong to force them elsewhere)
 *  - it's okay to pick a root that isn't the root of a file system, e.g.
 *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
 *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
 *    first.
 */
SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
		const char __user *, put_old)
{
	struct path new, old, parent_path, root_parent, root;
	struct mount *new_mnt, *root_mnt, *old_mnt;
	struct mountpoint *old_mp, *root_mp;
	int error;

	if (!may_mount())
		return -EPERM;

	error = user_path_dir(new_root, &new);
	if (error)
		goto out0;

	error = user_path_dir(put_old, &old);
	if (error)
		goto out1;

	error = security_sb_pivotroot(&old, &new);
	if (error)
		goto out2;

	get_fs_root(current->fs, &root);
	old_mp = lock_mount(&old);
	error = PTR_ERR(old_mp);
	if (IS_ERR(old_mp))
		goto out3;

	error = -EINVAL;
	new_mnt = real_mount(new.mnt);
	root_mnt = real_mount(root.mnt);
	old_mnt = real_mount(old.mnt);
	if (IS_MNT_SHARED(old_mnt) ||
		IS_MNT_SHARED(new_mnt->mnt_parent) ||
		IS_MNT_SHARED(root_mnt->mnt_parent))
		goto out4;
	if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
		goto out4;
	if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
		goto out4;
	error = -ENOENT;
	if (d_unlinked(new.dentry))
		goto out4;
	error = -EBUSY;
	if (new_mnt == root_mnt || old_mnt == root_mnt)
		goto out4; /* loop, on the same file system */
	error = -EINVAL;
	if (root.mnt->mnt_root != root.dentry)
		goto out4; /* not a mountpoint */
	if (!mnt_has_parent(root_mnt))
		goto out4; /* not attached */
	root_mp = root_mnt->mnt_mp;
	if (new.mnt->mnt_root != new.dentry)
		goto out4; /* not a mountpoint */
	if (!mnt_has_parent(new_mnt))
		goto out4; /* not attached */
	/* make sure we can reach put_old from new_root */
	if (!is_path_reachable(old_mnt, old.dentry, &new))
		goto out4;
	/* make certain new is below the root */
	if (!is_path_reachable(new_mnt, new.dentry, &root))
		goto out4;
	root_mp->m_count++; /* pin it so it won't go away */
	lock_mount_hash();
	detach_mnt(new_mnt, &parent_path);
	detach_mnt(root_mnt, &root_parent);
	if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
		new_mnt->mnt.mnt_flags |= MNT_LOCKED;
		root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
	}
	/* mount old root on put_old */
	attach_mnt(root_mnt, old_mnt, old_mp);
	/* mount new_root on / */
	attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp);
	touch_mnt_namespace(current->nsproxy->mnt_ns);
	unlock_mount_hash();
	chroot_fs_refs(&root, &new);
	put_mountpoint(root_mp);
	error = 0;
out4:
	unlock_mount(old_mp);
	if (!error) {
		path_put(&root_parent);
		path_put(&parent_path);
	}
out3:
	path_put(&root);
out2:
	path_put(&old);
out1:
	path_put(&new);
out0:
	return error;
}
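/*
 * Illustrative sketch (editor's addition, not from this file): the usual
 * calling sequence for the syscall above, in the style of an initramfs
 * switch-over or container runtime. Paths are placeholders; put_old must
 * satisfy the is_path_reachable() checks, i.e. live underneath new_root.
 */
#if 0	/* example only; not part of namespace.c */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* Shared mounts fail the IS_MNT_SHARED() checks; go private first. */
	mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL);

	/* new_root must itself be a mount point; a self-bind ensures that. */
	mount("/newroot", "/newroot", NULL, MS_BIND, NULL);
	mkdir("/newroot/put_old", 0700);
	chdir("/newroot");

	/* glibc ships no wrapper, so go through syscall(2). */
	if (syscall(SYS_pivot_root, ".", "put_old") != 0) {
		perror("pivot_root");
		return 1;
	}
	chroot(".");

	/* The old root is now reachable at /put_old; detach it. */
	umount2("/put_old", MNT_DETACH);
	return 0;
}
#endif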

static void __init init_mount_tree(void)
{
        struct vfsmount *mnt;
        struct mnt_namespace *ns;
        struct path root;
        struct file_system_type *type;

        type = get_fs_type("rootfs");
        if (!type)
                panic("Can't find rootfs type");
        mnt = vfs_kern_mount(type, 0, "rootfs", NULL);
        put_filesystem(type);
        if (IS_ERR(mnt))
                panic("Can't create rootfs");

        ns = create_mnt_ns(mnt);
        if (IS_ERR(ns))
                panic("Can't allocate initial namespace");

        init_task.nsproxy->mnt_ns = ns;
        get_mnt_ns(ns);

        root.mnt = mnt;
        root.dentry = mnt->mnt_root;

        set_fs_pwd(current->fs, &root);
        set_fs_root(current->fs, &root);
}

void __init mnt_init(void)
{
        unsigned u;
        int err;

        mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
                        0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);

        mount_hashtable = alloc_large_system_hash("Mount-cache",
                                sizeof(struct hlist_head),
                                mhash_entries, 19,
                                0,
                                &m_hash_shift, &m_hash_mask, 0, 0);
        mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
                                sizeof(struct hlist_head),
                                mphash_entries, 19,
                                0,
                                &mp_hash_shift, &mp_hash_mask, 0, 0);

        if (!mount_hashtable || !mountpoint_hashtable)
                panic("Failed to allocate mount hash table\n");

        for (u = 0; u <= m_hash_mask; u++)
                INIT_HLIST_HEAD(&mount_hashtable[u]);
        for (u = 0; u <= mp_hash_mask; u++)
                INIT_HLIST_HEAD(&mountpoint_hashtable[u]);

        kernfs_init();

        err = sysfs_init();
        if (err)
                printk(KERN_WARNING "%s: sysfs_init error: %d\n",
                        __func__, err);
        fs_kobj = kobject_create_and_add("fs", NULL);
        if (!fs_kobj)
                printk(KERN_WARNING "%s: kobj create error\n", __func__);
        init_rootfs();
        init_mount_tree();
}
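mnt_init() sizes both tables with alloc_large_system_hash() and records the shift/mask pair it hands back. Those values are then used to fold the (parent vfsmount, mountpoint dentry) pair into a bucket index; the lookup helper lives earlier in fs/namespace.c and is not part of this hunk, but it works roughly like this sketch (reconstructed, may differ in detail from the exact upstream lines):

        static inline struct hlist_head *m_hash(struct vfsmount *mnt,
                                                struct dentry *dentry)
        {
                unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
                tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
                tmp = tmp + (tmp >> m_hash_shift);
                return &mount_hashtable[tmp & m_hash_mask];
        }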

void put_mnt_ns(struct mnt_namespace *ns)
{
        if (!atomic_dec_and_test(&ns->count))
                return;
        drop_collected_mounts(&ns->root->mnt);
        free_mnt_ns(ns);
}

struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
{
        struct vfsmount *mnt;
        mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data);
        if (!IS_ERR(mnt)) {
                /*
                 * it is a longterm mount, don't release mnt until
                 * we unmount before file sys is unregistered
                 */
                real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
        }
        return mnt;
}
EXPORT_SYMBOL_GPL(kern_mount_data);

void kern_unmount(struct vfsmount *mnt)
{
        /* release long term mount so mount point can be released */
        if (!IS_ERR_OR_NULL(mnt)) {
                real_mount(mnt)->mnt_ns = NULL;
                synchronize_rcu(); /* yecchhh... */
                mntput(mnt);
        }
}
EXPORT_SYMBOL(kern_unmount);
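kern_mount_data() backs the kern_mount() convenience macro (kern_mount(type) expands to kern_mount_data(type, NULL)); tagging mnt_ns with MNT_NS_INTERNAL keeps the mount pinned until kern_unmount() clears it. A hedged sketch of the usual pairing in a filesystem module, where my_fs_type and my_mnt are hypothetical names:

        static struct vfsmount *my_mnt;         /* hypothetical module state */

        static int __init my_fs_pin(void)
        {
                my_mnt = kern_mount(&my_fs_type);       /* hypothetical fs type */
                if (IS_ERR(my_mnt))
                        return PTR_ERR(my_mnt);
                return 0;
        }

        static void __exit my_fs_unpin(void)
        {
                kern_unmount(my_mnt);   /* tolerates ERR_PTR and NULL */
        }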

bool our_mnt(struct vfsmount *mnt)
{
        return check_mnt(real_mount(mnt));
}

bool current_chrooted(void)
{
        /* Does the current process have a non-standard root */
        struct path ns_root;
        struct path fs_root;
        bool chrooted;

        /* Find the namespace root */
        ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
        ns_root.dentry = ns_root.mnt->mnt_root;
        path_get(&ns_root);
        while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
                ;

        get_fs_root(current->fs, &fs_root);

        chrooted = !path_equal(&fs_root, &ns_root);

        path_put(&fs_root);
        path_put(&ns_root);

        return chrooted;
}
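current_chrooted() compares the task's fs root against the namespace root after following overmounts down. A rough userspace analogue compares the identity of "/" with init's root as exposed through /proc/1/root; this is only an approximation — it assumes PID 1 shares the mount namespace, is not itself chrooted, and that the caller may stat that magic symlink (usually requires root):

        #include <stdbool.h>
        #include <sys/stat.h>

        static bool probably_chrooted(void)
        {
                struct stat self, init;

                if (stat("/", &self) || stat("/proc/1/root/.", &init))
                        return false;   /* cannot tell */
                return self.st_dev != init.st_dev || self.st_ino != init.st_ino;
        }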

bool fs_fully_visible(struct file_system_type *type)
{
        struct mnt_namespace *ns = current->nsproxy->mnt_ns;
        struct mount *mnt;
        bool visible = false;

        if (unlikely(!ns))
                return false;

        down_read(&namespace_sem);
        list_for_each_entry(mnt, &ns->list, mnt_list) {
                struct mount *child;
                if (mnt->mnt.mnt_sb->s_type != type)
                        continue;

                /* This mount is not fully visible if there are any child mounts
                 * that cover anything except for empty directories.
                 */
                list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
                        struct inode *inode = child->mnt_mountpoint->d_inode;
                        if (!S_ISDIR(inode->i_mode))
                                goto next;
                        if (inode->i_nlink > 2)
                                goto next;
                }
                visible = true;
                goto found;
        next:   ;
        }
found:
        up_read(&namespace_sem);
        return visible;
}
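The i_nlink > 2 test is the traditional empty-directory heuristic: on most Unix filesystems a fresh directory carries two links ("." plus its entry in the parent) and gains one per subdirectory, so anything above 2 implies children. It is a heuristic, not a guarantee (some filesystems, btrfs for one, report a link count of 1 for directories). A quick userspace illustration with a hypothetical path:

        #include <stdio.h>
        #include <sys/stat.h>

        int main(void)
        {
                struct stat st;

                /* /tmp/somedir is a stand-in: empty -> st_nlink is
                 * typically 2, rising by one per subdirectory */
                if (stat("/tmp/somedir", &st) == 0)
                        printf("nlink = %lu\n", (unsigned long)st.st_nlink);
                return 0;
        }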

static void *mntns_get(struct task_struct *task)
{
        struct mnt_namespace *ns = NULL;
        struct nsproxy *nsproxy;

        task_lock(task);
        nsproxy = task->nsproxy;
        if (nsproxy) {
                ns = nsproxy->mnt_ns;
                get_mnt_ns(ns);
        }
        task_unlock(task);

        return ns;
}

static void mntns_put(void *ns)
{
        put_mnt_ns(ns);
}

static int mntns_install(struct nsproxy *nsproxy, void *ns)
{
        struct fs_struct *fs = current->fs;
        struct mnt_namespace *mnt_ns = ns;
        struct path root;

        if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
            !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
            !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
                return -EPERM;

        if (fs->users != 1)
                return -EINVAL;

        get_mnt_ns(mnt_ns);
        put_mnt_ns(nsproxy->mnt_ns);
        nsproxy->mnt_ns = mnt_ns;

        /* Find the root */
        root.mnt = &mnt_ns->root->mnt;
        root.dentry = mnt_ns->root->mnt.mnt_root;
        path_get(&root);
        while (d_mountpoint(root.dentry) && follow_down_one(&root))
                ;

        /* Update the pwd and root */
        set_fs_pwd(fs, &root);
        set_fs_root(fs, &root);

        path_put(&root);
        return 0;
}
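mntns_install() is the kernel side of setns(2) on a /proc/&lt;pid&gt;/ns/mnt descriptor; note the fs->users != 1 check, which rejects callers sharing their fs_struct (e.g. multithreaded processes), and the capability tests behind the -EPERM path above. A minimal userspace sketch, with 1234 as a hypothetical target pid:

        #define _GNU_SOURCE
        #include <fcntl.h>
        #include <sched.h>
        #include <stdio.h>
        #include <unistd.h>

        int main(void)
        {
                int fd = open("/proc/1234/ns/mnt", O_RDONLY);   /* hypothetical pid */

                if (fd < 0 || setns(fd, CLONE_NEWNS) < 0) {
                        perror("setns");
                        return 1;
                }
                close(fd);
                /* now inside the target mount namespace; root and cwd
                 * were reset to its root by mntns_install() */
                return 0;
        }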

static unsigned int mntns_inum(void *ns)
{
        struct mnt_namespace *mnt_ns = ns;
        return mnt_ns->proc_inum;
}

const struct proc_ns_operations mntns_operations = {
        .name           = "mnt",
        .type           = CLONE_NEWNS,
        .get            = mntns_get,
        .put            = mntns_put,
        .install        = mntns_install,
        .inum           = mntns_inum,
};