Commit 80d4d8397a96b79d4562f68596ba33432ab47cd1

Authored by Eric W. Biederman
Committed by Greg Kroah-Hartman
1 parent 16811f0192

mnt: Implicitly add MNT_NODEV on remount when it was implicitly added by mount

commit 3e1866410f11356a9fd869beb3e95983dc79c067 upstream.

Now that remount properly enforces the rule that you can't remove
nodev, at least sandstorm.io breaks when performing a remount.

It turns out that there is an easy, intuitive solution: implicitly
add nodev on remount when nodev was implicitly added on mount.

Tested-by: Cedric Bosdonnat <cbosdonnat@suse.com>
Tested-by: Richard Weinberger <richard@nod.at>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Showing 1 changed file with 7 additions and 1 deletion (inline diff)

1 /* 1 /*
2 * linux/fs/namespace.c 2 * linux/fs/namespace.c
3 * 3 *
4 * (C) Copyright Al Viro 2000, 2001 4 * (C) Copyright Al Viro 2000, 2001
5 * Released under GPL v2. 5 * Released under GPL v2.
6 * 6 *
7 * Based on code from fs/super.c, copyright Linus Torvalds and others. 7 * Based on code from fs/super.c, copyright Linus Torvalds and others.
8 * Heavily rewritten. 8 * Heavily rewritten.
9 */ 9 */
10 10
11 #include <linux/syscalls.h> 11 #include <linux/syscalls.h>
12 #include <linux/export.h> 12 #include <linux/export.h>
13 #include <linux/capability.h> 13 #include <linux/capability.h>
14 #include <linux/mnt_namespace.h> 14 #include <linux/mnt_namespace.h>
15 #include <linux/user_namespace.h> 15 #include <linux/user_namespace.h>
16 #include <linux/namei.h> 16 #include <linux/namei.h>
17 #include <linux/security.h> 17 #include <linux/security.h>
18 #include <linux/idr.h> 18 #include <linux/idr.h>
19 #include <linux/init.h> /* init_rootfs */ 19 #include <linux/init.h> /* init_rootfs */
20 #include <linux/fs_struct.h> /* get_fs_root et.al. */ 20 #include <linux/fs_struct.h> /* get_fs_root et.al. */
21 #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ 21 #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
22 #include <linux/uaccess.h> 22 #include <linux/uaccess.h>
23 #include <linux/proc_ns.h> 23 #include <linux/proc_ns.h>
24 #include <linux/magic.h> 24 #include <linux/magic.h>
25 #include <linux/bootmem.h> 25 #include <linux/bootmem.h>
26 #include <linux/task_work.h> 26 #include <linux/task_work.h>
27 #include "pnode.h" 27 #include "pnode.h"
28 #include "internal.h" 28 #include "internal.h"
29 29
/*
 * Sizing parameters for the mount (m_) and mountpoint (mp_) hash tables
 * below; set up once at boot (hence __read_mostly).
 */
static unsigned int m_hash_mask __read_mostly;
static unsigned int m_hash_shift __read_mostly;
static unsigned int mp_hash_mask __read_mostly;
static unsigned int mp_hash_shift __read_mostly;
34 34
/* "mhash_entries=" boot parameter: requested mount hash table size. */
static __initdata unsigned long mhash_entries;
static int __init set_mhash_entries(char *str)
{
	/* Returning 0 tells the early-param code the option was not handled. */
	if (!str)
		return 0;
	mhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("mhash_entries=", set_mhash_entries);
44 44
/* "mphash_entries=" boot parameter: requested mountpoint hash table size. */
static __initdata unsigned long mphash_entries;
static int __init set_mphash_entries(char *str)
{
	/* Returning 0 tells the early-param code the option was not handled. */
	if (!str)
		return 0;
	mphash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("mphash_entries=", set_mphash_entries);
54 54
static u64 event;			/* mount-tree change counter */
static DEFINE_IDA(mnt_id_ida);		/* allocator for mnt->mnt_id */
static DEFINE_IDA(mnt_group_ida);	/* allocator for peer-group ids */
static DEFINE_SPINLOCK(mnt_id_lock);	/* serializes mnt_id alloc vs free */
static int mnt_id_start = 0;		/* lowest id that may be free */
static int mnt_group_start = 1;		/* lowest group id that may be free */

static struct hlist_head *mount_hashtable __read_mostly;
static struct hlist_head *mountpoint_hashtable __read_mostly;
static struct kmem_cache *mnt_cache __read_mostly;
static DECLARE_RWSEM(namespace_sem);

/* /sys/fs */
struct kobject *fs_kobj;
EXPORT_SYMBOL_GPL(fs_kobj);

/*
 * vfsmount lock may be taken for read to prevent changes to the
 * vfsmount hash, ie. during mountpoint lookups or walking back
 * up the tree.
 *
 * It should be taken for write in all cases where the vfsmount
 * tree or hash is modified or when a vfsmount structure is modified.
 */
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
80 80
81 static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry) 81 static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
82 { 82 {
83 unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); 83 unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
84 tmp += ((unsigned long)dentry / L1_CACHE_BYTES); 84 tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
85 tmp = tmp + (tmp >> m_hash_shift); 85 tmp = tmp + (tmp >> m_hash_shift);
86 return &mount_hashtable[tmp & m_hash_mask]; 86 return &mount_hashtable[tmp & m_hash_mask];
87 } 87 }
88 88
89 static inline struct hlist_head *mp_hash(struct dentry *dentry) 89 static inline struct hlist_head *mp_hash(struct dentry *dentry)
90 { 90 {
91 unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES); 91 unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
92 tmp = tmp + (tmp >> mp_hash_shift); 92 tmp = tmp + (tmp >> mp_hash_shift);
93 return &mountpoint_hashtable[tmp & mp_hash_mask]; 93 return &mountpoint_hashtable[tmp & mp_hash_mask];
94 } 94 }
95 95
/*
 * allocation is serialized by namespace_sem, but we need the spinlock to
 * serialize with freeing.
 */
/* Assign a unique mnt_id to @mnt.  Returns 0 or a negative errno. */
static int mnt_alloc_id(struct mount *mnt)
{
	int res;

retry:
	/* Preallocate outside the spinlock: ida_pre_get() may sleep. */
	ida_pre_get(&mnt_id_ida, GFP_KERNEL);
	spin_lock(&mnt_id_lock);
	res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
	if (!res)
		mnt_id_start = mnt->mnt_id + 1;
	spin_unlock(&mnt_id_lock);
	/* -EAGAIN: the IDA needs more preallocated memory; replenish and retry. */
	if (res == -EAGAIN)
		goto retry;

	return res;
}
116 116
/* Release @mnt's mnt_id back to the allocator. */
static void mnt_free_id(struct mount *mnt)
{
	int id = mnt->mnt_id;
	spin_lock(&mnt_id_lock);
	ida_remove(&mnt_id_ida, id);
	/* Keep mnt_id_start pointing at the lowest id that may be free. */
	if (mnt_id_start > id)
		mnt_id_start = id;
	spin_unlock(&mnt_id_lock);
}
126 126
/*
 * Allocate a new peer group ID
 *
 * mnt_group_ida is protected by namespace_sem
 */
static int mnt_alloc_group_id(struct mount *mnt)
{
	int res;

	/* No retry loop here: namespace_sem excludes concurrent allocators. */
	if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
		return -ENOMEM;

	res = ida_get_new_above(&mnt_group_ida,
				mnt_group_start,
				&mnt->mnt_group_id);
	if (!res)
		mnt_group_start = mnt->mnt_group_id + 1;

	return res;
}
147 147
/*
 * Release a peer group ID
 */
void mnt_release_group_id(struct mount *mnt)
{
	int id = mnt->mnt_group_id;
	ida_remove(&mnt_group_ida, id);
	if (mnt_group_start > id)
		mnt_group_start = id;
	/* 0 means "no peer group" for this mount. */
	mnt->mnt_group_id = 0;
}
159 159
/*
 * vfsmount lock must be held for read
 */
/* Adjust @mnt's reference count by @n (may be negative). */
static inline void mnt_add_count(struct mount *mnt, int n)
{
#ifdef CONFIG_SMP
	/* Per-cpu counter: only the summed total is meaningful. */
	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
#else
	preempt_disable();
	mnt->mnt_count += n;
	preempt_enable();
#endif
}
173 173
/*
 * vfsmount lock must be held for write
 */
/* Sum the per-cpu reference counts into @mnt's total. */
unsigned int mnt_get_count(struct mount *mnt)
{
#ifdef CONFIG_SMP
	unsigned int count = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
	}

	return count;
#else
	return mnt->mnt_count;
#endif
}
192 192
/*
 * Allocate and minimally initialize a struct mount: assign a unique
 * mnt_id, duplicate @name as the device name, set the initial reference
 * count to 1 and initialize all list linkage.  Returns NULL on any
 * allocation failure; partially-built state is unwound via the goto chain.
 */
static struct mount *alloc_vfsmnt(const char *name)
{
	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
	if (mnt) {
		int err;

		err = mnt_alloc_id(mnt);
		if (err)
			goto out_free_cache;

		if (name) {
			mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
			if (!mnt->mnt_devname)
				goto out_free_id;
		}

#ifdef CONFIG_SMP
		mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
		if (!mnt->mnt_pcp)
			goto out_free_devname;

		/* Fresh mount starts with one reference. */
		this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
#else
		mnt->mnt_count = 1;
		mnt->mnt_writers = 0;
#endif

		INIT_HLIST_NODE(&mnt->mnt_hash);
		INIT_LIST_HEAD(&mnt->mnt_child);
		INIT_LIST_HEAD(&mnt->mnt_mounts);
		INIT_LIST_HEAD(&mnt->mnt_list);
		INIT_LIST_HEAD(&mnt->mnt_expire);
		INIT_LIST_HEAD(&mnt->mnt_share);
		INIT_LIST_HEAD(&mnt->mnt_slave_list);
		INIT_LIST_HEAD(&mnt->mnt_slave);
		INIT_HLIST_NODE(&mnt->mnt_mp_list);
#ifdef CONFIG_FSNOTIFY
		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
#endif
	}
	return mnt;

#ifdef CONFIG_SMP
out_free_devname:
	kfree(mnt->mnt_devname);
#endif
out_free_id:
	mnt_free_id(mnt);
out_free_cache:
	kmem_cache_free(mnt_cache, mnt);
	return NULL;
}
245 245
246 /* 246 /*
247 * Most r/o checks on a fs are for operations that take 247 * Most r/o checks on a fs are for operations that take
248 * discrete amounts of time, like a write() or unlink(). 248 * discrete amounts of time, like a write() or unlink().
249 * We must keep track of when those operations start 249 * We must keep track of when those operations start
250 * (for permission checks) and when they end, so that 250 * (for permission checks) and when they end, so that
251 * we can determine when writes are able to occur to 251 * we can determine when writes are able to occur to
252 * a filesystem. 252 * a filesystem.
253 */ 253 */
254 /* 254 /*
255 * __mnt_is_readonly: check whether a mount is read-only 255 * __mnt_is_readonly: check whether a mount is read-only
256 * @mnt: the mount to check for its write status 256 * @mnt: the mount to check for its write status
257 * 257 *
258 * This shouldn't be used directly ouside of the VFS. 258 * This shouldn't be used directly ouside of the VFS.
259 * It does not guarantee that the filesystem will stay 259 * It does not guarantee that the filesystem will stay
260 * r/w, just that it is right *now*. This can not and 260 * r/w, just that it is right *now*. This can not and
261 * should not be used in place of IS_RDONLY(inode). 261 * should not be used in place of IS_RDONLY(inode).
262 * mnt_want/drop_write() will _keep_ the filesystem 262 * mnt_want/drop_write() will _keep_ the filesystem
263 * r/w. 263 * r/w.
264 */ 264 */
265 int __mnt_is_readonly(struct vfsmount *mnt) 265 int __mnt_is_readonly(struct vfsmount *mnt)
266 { 266 {
267 if (mnt->mnt_flags & MNT_READONLY) 267 if (mnt->mnt_flags & MNT_READONLY)
268 return 1; 268 return 1;
269 if (mnt->mnt_sb->s_flags & MS_RDONLY) 269 if (mnt->mnt_sb->s_flags & MS_RDONLY)
270 return 1; 270 return 1;
271 return 0; 271 return 0;
272 } 272 }
273 EXPORT_SYMBOL_GPL(__mnt_is_readonly); 273 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
274 274
/* Bump @mnt's active-writer count (per-cpu on SMP). */
static inline void mnt_inc_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers++;
#endif
}
283 283
/* Drop @mnt's active-writer count; pairs with mnt_inc_writers(). */
static inline void mnt_dec_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers--;
#endif
}
292 292
/*
 * Sum @mnt's per-cpu writer counts.  Only reliable while writers are
 * held off (see the MNT_WRITE_HOLD discussion in mnt_make_readonly()).
 */
static unsigned int mnt_get_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	unsigned int count = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
	}

	return count;
#else
	return mnt->mnt_writers;
#endif
}
308 308
/* Like __mnt_is_readonly() but also honors an in-progress r/o remount. */
static int mnt_is_readonly(struct vfsmount *mnt)
{
	if (mnt->mnt_sb->s_readonly_remount)
		return 1;
	/* Order wrt setting s_flags/s_readonly_remount in do_remount() */
	smp_rmb();
	return __mnt_is_readonly(mnt);
}
317 317
/*
 * Most r/o & frozen checks on a fs are for operations that take discrete
 * amounts of time, like a write() or unlink(). We must keep track of when
 * those operations start (for permission checks) and when they end, so that we
 * can determine when writes are able to occur to a filesystem.
 */
/**
 * __mnt_want_write - get write access to a mount without freeze protection
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mnt is read-write) before
 * returning success. This operation does not protect against filesystem being
 * frozen. When the write operation is finished, __mnt_drop_write() must be
 * called. This is effectively a refcount.
 */
int __mnt_want_write(struct vfsmount *m)
{
	struct mount *mnt = real_mount(m);
	int ret = 0;

	preempt_disable();
	mnt_inc_writers(mnt);
	/*
	 * The store to mnt_inc_writers must be visible before we pass
	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
	 * incremented count after it has set MNT_WRITE_HOLD.
	 */
	smp_mb();
	while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
		cpu_relax();
	/*
	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
	 * be set to match its requirements. So we must not load that until
	 * MNT_WRITE_HOLD is cleared.
	 */
	smp_rmb();
	if (mnt_is_readonly(m)) {
		/* Lost the race with a r/o remount: back out our count. */
		mnt_dec_writers(mnt);
		ret = -EROFS;
	}
	preempt_enable();

	return ret;
}
363 363
/**
 * mnt_want_write - get write access to a mount
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write, filesystem
 * is not frozen) before returning success. When the write operation is
 * finished, mnt_drop_write() must be called. This is effectively a refcount.
 */
int mnt_want_write(struct vfsmount *m)
{
	int ret;

	sb_start_write(m->mnt_sb);
	ret = __mnt_want_write(m);
	/* Don't hold freeze protection if the write ref was refused. */
	if (ret)
		sb_end_write(m->mnt_sb);
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);
384 384
/**
 * mnt_clone_write - get write access to a mount
 * @mnt: the mount on which to take a write
 *
 * This is effectively like mnt_want_write, except
 * it must only be used to take an extra write reference
 * on a mountpoint that we already know has a write reference
 * on it. This allows some optimisation.
 *
 * After finished, mnt_drop_write must be called as usual to
 * drop the reference.
 */
int mnt_clone_write(struct vfsmount *mnt)
{
	/* superblock may be r/o */
	if (__mnt_is_readonly(mnt))
		return -EROFS;
	/* No MNT_WRITE_HOLD spin needed: a write ref is already held. */
	preempt_disable();
	mnt_inc_writers(real_mount(mnt));
	preempt_enable();
	return 0;
}
EXPORT_SYMBOL_GPL(mnt_clone_write);
408 408
/**
 * __mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount on which to take a write
 *
 * This is like __mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
int __mnt_want_write_file(struct file *file)
{
	if (!(file->f_mode & FMODE_WRITER))
		return __mnt_want_write(file->f_path.mnt);
	else
		return mnt_clone_write(file->f_path.mnt);
}
423 423
/**
 * mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount on which to take a write
 *
 * This is like mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
int mnt_want_write_file(struct file *file)
{
	int ret;

	sb_start_write(file->f_path.mnt->mnt_sb);
	ret = __mnt_want_write_file(file);
	/* Drop freeze protection again if the write ref was refused. */
	if (ret)
		sb_end_write(file->f_path.mnt->mnt_sb);
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write_file);
442 442
/**
 * __mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done
 * performing writes to it. Must be matched with
 * __mnt_want_write() call above.
 */
void __mnt_drop_write(struct vfsmount *mnt)
{
	preempt_disable();
	mnt_dec_writers(real_mount(mnt));
	preempt_enable();
}
457 457
/**
 * mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done performing writes to it and
 * also allows filesystem to be frozen again. Must be matched with
 * mnt_want_write() call above.
 */
void mnt_drop_write(struct vfsmount *mnt)
{
	__mnt_drop_write(mnt);
	sb_end_write(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);
472 472
/* File-based counterpart of __mnt_drop_write(). */
void __mnt_drop_write_file(struct file *file)
{
	__mnt_drop_write(file->f_path.mnt);
}
477 477
/* File-based counterpart of mnt_drop_write(); pairs with mnt_want_write_file(). */
void mnt_drop_write_file(struct file *file)
{
	mnt_drop_write(file->f_path.mnt);
}
EXPORT_SYMBOL(mnt_drop_write_file);
483 483
/*
 * Flip @mnt to read-only, failing with -EBUSY if any writes are in
 * flight.  Uses MNT_WRITE_HOLD to stall new writers while the per-cpu
 * writer counts are summed.
 */
static int mnt_make_readonly(struct mount *mnt)
{
	int ret = 0;

	lock_mount_hash();
	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
	/*
	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
	 * should be visible before we do.
	 */
	smp_mb();

	/*
	 * With writers on hold, if this value is zero, then there are
	 * definitely no active writers (although held writers may subsequently
	 * increment the count, they'll have to wait, and decrement it after
	 * seeing MNT_READONLY).
	 *
	 * It is OK to have counter incremented on one CPU and decremented on
	 * another: the sum will add up correctly. The danger would be when we
	 * sum up each counter, if we read a counter before it is incremented,
	 * but then read another CPU's count which it has been subsequently
	 * decremented from -- we would see more decrements than we should.
	 * MNT_WRITE_HOLD protects against this scenario, because
	 * mnt_want_write first increments count, then smp_mb, then spins on
	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
	 * we're counting up here.
	 */
	if (mnt_get_writers(mnt) > 0)
		ret = -EBUSY;
	else
		mnt->mnt.mnt_flags |= MNT_READONLY;
	/*
	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
	 * that become unheld will see MNT_READONLY.
	 */
	smp_wmb();
	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
	unlock_mount_hash();
	return ret;
}
525 525
/* Clear MNT_READONLY again; inverse of mnt_make_readonly()'s success path. */
static void __mnt_unmake_readonly(struct mount *mnt)
{
	lock_mount_hash();
	mnt->mnt.mnt_flags &= ~MNT_READONLY;
	unlock_mount_hash();
}
532 532
/*
 * Prepare @sb for a read-only remount: hold off new writers on every
 * mount of this superblock, fail with -EBUSY if any writes (or pending
 * file removals) are outstanding, and on success set s_readonly_remount
 * so mnt_is_readonly() refuses new write refs during the transition.
 * MNT_WRITE_HOLD is always dropped again before returning.
 */
int sb_prepare_remount_readonly(struct super_block *sb)
{
	struct mount *mnt;
	int err = 0;

	/* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */
	if (atomic_long_read(&sb->s_remove_count))
		return -EBUSY;

	lock_mount_hash();
	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
		if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
			mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
			smp_mb();
			if (mnt_get_writers(mnt) > 0) {
				err = -EBUSY;
				break;
			}
		}
	}
	if (!err && atomic_long_read(&sb->s_remove_count))
		err = -EBUSY;

	if (!err) {
		sb->s_readonly_remount = 1;
		/* Pairs with the smp_rmb() in mnt_is_readonly(). */
		smp_wmb();
	}
	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
			mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
	}
	unlock_mount_hash();

	return err;
}
568 568
/*
 * Release the memory owned by @mnt itself: the device-name copy, the
 * per-cpu writer counters (SMP only) and finally the mount structure.
 * Callers must ensure no RCU readers can still see @mnt (see
 * delayed_free_vfsmnt() for the RCU-deferred path).
 */
static void free_vfsmnt(struct mount *mnt)
{
	kfree(mnt->mnt_devname);
#ifdef CONFIG_SMP
	free_percpu(mnt->mnt_pcp);
#endif
	kmem_cache_free(mnt_cache, mnt);
}
577 577
/* RCU callback: free a mount once all RCU readers are done with it. */
static void delayed_free_vfsmnt(struct rcu_head *head)
{
	free_vfsmnt(container_of(head, struct mount, mnt_rcu));
}
582 582
/*
 * Try to convert an RCU-protected vfsmount reference into a real,
 * counted reference.  Call under rcu_read_lock().
 *
 * Returns true if @bastard (possibly NULL) was observed in a stable
 * mount-lock sequence window and, when non-NULL, its refcount was bumped.
 * Returns false if the seqcount moved, in which case any speculative
 * count is undone: for MNT_SYNC_UMOUNT the count is simply dropped,
 * otherwise mntput() must run outside the RCU read section so it can
 * perform the (possibly blocking) teardown.
 */
/* call under rcu_read_lock */
bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
	struct mount *mnt;
	if (read_seqretry(&mount_lock, seq))
		return false;
	if (bastard == NULL)
		return true;
	mnt = real_mount(bastard);
	mnt_add_count(mnt, 1);
	/* recheck: if nothing changed since the bump, the ref is good */
	if (likely(!read_seqretry(&mount_lock, seq)))
		return true;
	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
		mnt_add_count(mnt, -1);
		return false;
	}
	/* drop the ref outside RCU; mntput() may sleep in teardown */
	rcu_read_unlock();
	mntput(bastard);
	rcu_read_lock();
	return false;
}
604 604
/*
 * find the first mount at @dentry on vfsmount @mnt.
 * call under rcu_read_lock()
 *
 * Walks the mount hash chain for the (@mnt, @dentry) pair and returns
 * the first matching child mount, or NULL if nothing is mounted there.
 */
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
	struct hlist_head *head = m_hash(mnt, dentry);
	struct mount *p;

	hlist_for_each_entry_rcu(p, head, mnt_hash)
		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
			return p;
	return NULL;
}
619 619
/*
 * find the last mount at @dentry on vfsmount @mnt.
 * mount_lock must be held.
 *
 * Mounts shadowing the same mountpoint sit consecutively on the hash
 * chain (see attach_shadowed()), so continue from the first match and
 * remember the last entry that still matches.
 */
struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
{
	struct mount *p, *res;
	res = p = __lookup_mnt(mnt, dentry);
	if (!p)
		goto out;
	hlist_for_each_entry_continue(p, mnt_hash) {
		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
			break;
		res = p;
	}
out:
	return res;
}
638 638
/*
 * lookup_mnt - Return the first child mount mounted at path
 *
 * "First" means first mounted chronologically.  If you create the
 * following mounts:
 *
 * mount /dev/sda1 /mnt
 * mount /dev/sda2 /mnt
 * mount /dev/sda3 /mnt
 *
 * Then lookup_mnt() on the base /mnt dentry in the root mount will
 * return successively the root dentry and vfsmount of /dev/sda1, then
 * /dev/sda2, then /dev/sda3, then NULL.
 *
 * lookup_mnt takes a reference to the found vfsmount.
 */
struct vfsmount *lookup_mnt(struct path *path)
{
	struct mount *child_mnt;
	struct vfsmount *m;
	unsigned seq;

	rcu_read_lock();
	/* retry until the lookup and the refcount bump happen in one
	 * stable mount_lock sequence window */
	do {
		seq = read_seqbegin(&mount_lock);
		child_mnt = __lookup_mnt(path->mnt, path->dentry);
		m = child_mnt ? &child_mnt->mnt : NULL;
	} while (!legitimize_mnt(m, seq));
	rcu_read_unlock();
	return m;
}
670 670
/*
 * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
 * current mount namespace.
 *
 * The common case is dentries are not mountpoints at all and that
 * test is handled inline.  For the slow case when we are actually
 * dealing with a mountpoint of some kind, walk through all of the
 * mounts in the current mount namespace and test to see if the dentry
 * is a mountpoint.
 *
 * The mount_hashtable is not usable in the context because we
 * need to identify all mounts that may be in the current mount
 * namespace not just a mount that happens to have some specified
 * parent mount.
 */
bool __is_local_mountpoint(struct dentry *dentry)
{
	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
	struct mount *mnt;
	bool is_covered = false;

	/* fast path: d_mountpoint() already filters most dentries */
	if (!d_mountpoint(dentry))
		goto out;

	down_read(&namespace_sem);
	list_for_each_entry(mnt, &ns->list, mnt_list) {
		is_covered = (mnt->mnt_mountpoint == dentry);
		if (is_covered)
			break;
	}
	up_read(&namespace_sem);
out:
	return is_covered;
}
705 705
/*
 * Find an existing struct mountpoint for @dentry in the mountpoint hash
 * and take a reference on it.  Returns NULL if none exists, or
 * ERR_PTR(-ENOENT) if the dentry has been unlinked in the meantime.
 */
static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
{
	struct hlist_head *chain = mp_hash(dentry);
	struct mountpoint *mp;

	hlist_for_each_entry(mp, chain, m_hash) {
		if (mp->m_dentry == dentry) {
			/* might be worth a WARN_ON() */
			if (d_unlinked(dentry))
				return ERR_PTR(-ENOENT);
			mp->m_count++;
			return mp;
		}
	}
	return NULL;
}
722 722
/*
 * Allocate a fresh struct mountpoint for @dentry, mark the dentry as
 * mounted (DCACHE_MOUNTED via d_set_mounted()) and hash the mountpoint.
 * Returns the new mountpoint with m_count == 1, or an ERR_PTR on
 * allocation failure or when d_set_mounted() refuses (e.g. dead dentry).
 */
static struct mountpoint *new_mountpoint(struct dentry *dentry)
{
	struct hlist_head *chain = mp_hash(dentry);
	struct mountpoint *mp;
	int ret;

	mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
	if (!mp)
		return ERR_PTR(-ENOMEM);

	ret = d_set_mounted(dentry);
	if (ret) {
		kfree(mp);
		return ERR_PTR(ret);
	}

	mp->m_dentry = dentry;
	mp->m_count = 1;
	hlist_add_head(&mp->m_hash, chain);
	INIT_HLIST_HEAD(&mp->m_list);
	return mp;
}
745 745
/*
 * Drop a reference on @mp.  On the last put, clear DCACHE_MOUNTED on the
 * dentry, unhash the mountpoint and free it.  The m_list of mounts using
 * this mountpoint must already be empty by then.
 */
static void put_mountpoint(struct mountpoint *mp)
{
	if (!--mp->m_count) {
		struct dentry *dentry = mp->m_dentry;
		BUG_ON(!hlist_empty(&mp->m_list));
		spin_lock(&dentry->d_lock);
		dentry->d_flags &= ~DCACHE_MOUNTED;
		spin_unlock(&dentry->d_lock);
		hlist_del(&mp->m_hash);
		kfree(mp);
	}
}
758 758
759 static inline int check_mnt(struct mount *mnt) 759 static inline int check_mnt(struct mount *mnt)
760 { 760 {
761 return mnt->mnt_ns == current->nsproxy->mnt_ns; 761 return mnt->mnt_ns == current->nsproxy->mnt_ns;
762 } 762 }
763 763
/*
 * vfsmount lock must be held for write
 *
 * Bump the global mount event counter into @ns and wake any pollers
 * (e.g. /proc/mounts watchers) waiting on the namespace.
 */
static void touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns) {
		ns->event = ++event;
		wake_up_interruptible(&ns->poll);
	}
}
774 774
/*
 * vfsmount lock must be held for write
 *
 * Like touch_mnt_namespace() but does not advance the global counter:
 * only propagates the current event count, and only wakes pollers if
 * the namespace hasn't already seen this event.
 */
static void __touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns && ns->event != event) {
		ns->event = event;
		wake_up_interruptible(&ns->poll);
	}
}
785 785
/*
 * vfsmount lock must be held for write
 *
 * Detach @mnt from its parent.  The old attachment point (parent mount +
 * mountpoint dentry) is returned through @old_path; @mnt becomes its own
 * parent, rooted at its own mnt_root, and is removed from the child list,
 * mount hash and mountpoint list.  The mountpoint reference is dropped.
 */
static void detach_mnt(struct mount *mnt, struct path *old_path)
{
	old_path->dentry = mnt->mnt_mountpoint;
	old_path->mnt = &mnt->mnt_parent->mnt;
	mnt->mnt_parent = mnt;
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	list_del_init(&mnt->mnt_child);
	hlist_del_init_rcu(&mnt->mnt_hash);
	hlist_del_init(&mnt->mnt_mp_list);
	put_mountpoint(mnt->mnt_mp);
	mnt->mnt_mp = NULL;
}
801 801
/*
 * vfsmount lock must be held for write
 *
 * Record @child_mnt as mounted on @mp under parent @mnt: take references
 * on the mountpoint, the parent mount and the mountpoint dentry, and link
 * the child onto the mountpoint's m_list.  Does NOT hash the child — see
 * attach_mnt()/attach_shadowed() for that.
 */
void mnt_set_mountpoint(struct mount *mnt,
			struct mountpoint *mp,
			struct mount *child_mnt)
{
	mp->m_count++;
	mnt_add_count(mnt, 1);	/* essentially, that's mntget */
	child_mnt->mnt_mountpoint = dget(mp->m_dentry);
	child_mnt->mnt_parent = mnt;
	child_mnt->mnt_mp = mp;
	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
}
816 816
/*
 * vfsmount lock must be held for write
 *
 * Fully attach @mnt under @parent at @mp: set the mountpoint linkage and
 * make the mount visible in the mount hash and the parent's child list.
 */
static void attach_mnt(struct mount *mnt,
			struct mount *parent,
			struct mountpoint *mp)
{
	mnt_set_mountpoint(parent, mp, mnt);
	hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry));
	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
}
828 828
/*
 * Hash @mnt and link it into @parent's child list, preserving shadow
 * ordering: if @shadows is an earlier mount on the same mountpoint, the
 * new mount is inserted directly behind it on both the hash chain and
 * the child list (so __lookup_mnt() still finds the first mount and
 * __lookup_mnt_last() the last); otherwise it is added at the head/tail
 * as a non-shadowing mount.
 */
static void attach_shadowed(struct mount *mnt,
			struct mount *parent,
			struct mount *shadows)
{
	if (shadows) {
		hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
		list_add(&mnt->mnt_child, &shadows->mnt_child);
	} else {
		hlist_add_head_rcu(&mnt->mnt_hash,
				m_hash(&parent->mnt, mnt->mnt_mountpoint));
		list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
	}
}
842 842
/*
 * vfsmount lock must be held for write
 *
 * Commit a (possibly multi-mount) tree rooted at @mnt into its parent's
 * namespace: tag every mount in the tree with the namespace, splice the
 * whole tree onto the namespace's mount list, hash the root respecting
 * shadow order, and notify namespace pollers.
 */
static void commit_tree(struct mount *mnt, struct mount *shadows)
{
	struct mount *parent = mnt->mnt_parent;
	struct mount *m;
	LIST_HEAD(head);
	struct mnt_namespace *n = parent->mnt_ns;

	BUG_ON(parent == mnt);

	/* pull the whole subtree onto a private list to tag it */
	list_add_tail(&head, &mnt->mnt_list);
	list_for_each_entry(m, &head, mnt_list)
		m->mnt_ns = n;

	list_splice(&head, n->list.prev);

	attach_shadowed(mnt, parent, shadows);
	touch_mnt_namespace(n);
}
864 864
/*
 * Depth-first successor of @p in the mount tree rooted at @root:
 * descend into the first child if any, otherwise climb until a parent
 * with a further sibling is found.  Returns NULL once the walk returns
 * to @root with no more siblings left.
 */
static struct mount *next_mnt(struct mount *p, struct mount *root)
{
	struct list_head *next = p->mnt_mounts.next;
	if (next == &p->mnt_mounts) {
		/* no children: climb until we find an unvisited sibling */
		while (1) {
			if (p == root)
				return NULL;
			next = p->mnt_child.next;
			if (next != &p->mnt_parent->mnt_mounts)
				break;
			p = p->mnt_parent;
		}
	}
	return list_entry(next, struct mount, mnt_child);
}
880 880
881 static struct mount *skip_mnt_tree(struct mount *p) 881 static struct mount *skip_mnt_tree(struct mount *p)
882 { 882 {
883 struct list_head *prev = p->mnt_mounts.prev; 883 struct list_head *prev = p->mnt_mounts.prev;
884 while (prev != &p->mnt_mounts) { 884 while (prev != &p->mnt_mounts) {
885 p = list_entry(prev, struct mount, mnt_child); 885 p = list_entry(prev, struct mount, mnt_child);
886 prev = p->mnt_mounts.prev; 886 prev = p->mnt_mounts.prev;
887 } 887 }
888 return p; 888 return p;
889 } 889 }
890 890
/*
 * Create a new mount of filesystem @type with the given mount @flags,
 * device @name and fs-specific @data.  Allocates the mount, asks the
 * filesystem for its root dentry via mount_fs(), and links the mount
 * onto the superblock's s_mounts list.  MS_KERNMOUNT marks the result
 * as a kernel-internal mount (MNT_INTERNAL).
 *
 * Returns the new vfsmount or an ERR_PTR on failure.
 */
struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
	struct mount *mnt;
	struct dentry *root;

	if (!type)
		return ERR_PTR(-ENODEV);

	mnt = alloc_vfsmnt(name);
	if (!mnt)
		return ERR_PTR(-ENOMEM);

	if (flags & MS_KERNMOUNT)
		mnt->mnt.mnt_flags = MNT_INTERNAL;

	root = mount_fs(type, flags, name, data);
	if (IS_ERR(root)) {
		mnt_free_id(mnt);
		free_vfsmnt(mnt);
		return ERR_CAST(root);
	}

	/* a freshly created mount is its own parent, rooted at fs root */
	mnt->mnt.mnt_root = root;
	mnt->mnt.mnt_sb = root->d_sb;
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	mnt->mnt_parent = mnt;
	lock_mount_hash();
	list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
	unlock_mount_hash();
	return &mnt->mnt;
}
EXPORT_SYMBOL_GPL(vfs_kern_mount);
924 924
/*
 * Duplicate mount @old, rooted at @root (which must be within @old's
 * filesystem).  @flag is a mask of CL_* bits controlling how the clone
 * relates to the original:
 *
 *  - CL_SLAVE / CL_PRIVATE / CL_SHARED_TO_SLAVE: not a propagation peer
 *    of the original (group id reset);
 *  - CL_MAKE_SHARED: allocate a peer group id and mark shared;
 *  - CL_UNPRIVILEGED: clone is for an unprivileged user namespace, so
 *    lock the current atime/readonly/nodev/nosuid/noexec settings with
 *    the corresponding MNT_LOCK_* bits so they can't be cleared on
 *    remount, and set MNT_LOCKED to keep what's underneath hidden;
 *  - CL_EXPIRE: keep the clone on the same expiry list as the original.
 *
 * Returns the new mount (with a new superblock reference) or an ERR_PTR.
 */
static struct mount *clone_mnt(struct mount *old, struct dentry *root,
					int flag)
{
	struct super_block *sb = old->mnt.mnt_sb;
	struct mount *mnt;
	int err;

	mnt = alloc_vfsmnt(old->mnt_devname);
	if (!mnt)
		return ERR_PTR(-ENOMEM);

	if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
		mnt->mnt_group_id = 0; /* not a peer of original */
	else
		mnt->mnt_group_id = old->mnt_group_id;

	if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
		err = mnt_alloc_group_id(mnt);
		if (err)
			goto out_free;
	}

	mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED);
	/* Don't allow unprivileged users to change mount flags */
	if (flag & CL_UNPRIVILEGED) {
		mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;

		if (mnt->mnt.mnt_flags & MNT_READONLY)
			mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;

		if (mnt->mnt.mnt_flags & MNT_NODEV)
			mnt->mnt.mnt_flags |= MNT_LOCK_NODEV;

		if (mnt->mnt.mnt_flags & MNT_NOSUID)
			mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID;

		if (mnt->mnt.mnt_flags & MNT_NOEXEC)
			mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC;
	}

	/* Don't allow unprivileged users to reveal what is under a mount */
	if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire))
		mnt->mnt.mnt_flags |= MNT_LOCKED;

	atomic_inc(&sb->s_active);
	mnt->mnt.mnt_sb = sb;
	mnt->mnt.mnt_root = dget(root);
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	mnt->mnt_parent = mnt;
	lock_mount_hash();
	list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
	unlock_mount_hash();

	/* wire up propagation relationships per the CL_* flags */
	if ((flag & CL_SLAVE) ||
	    ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
		list_add(&mnt->mnt_slave, &old->mnt_slave_list);
		mnt->mnt_master = old;
		CLEAR_MNT_SHARED(mnt);
	} else if (!(flag & CL_PRIVATE)) {
		if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
			list_add(&mnt->mnt_share, &old->mnt_share);
		if (IS_MNT_SLAVE(old))
			list_add(&mnt->mnt_slave, &old->mnt_slave);
		mnt->mnt_master = old->mnt_master;
	}
	if (flag & CL_MAKE_SHARED)
		set_mnt_shared(mnt);

	/* stick the duplicate mount on the same expiry list
	 * as the original if that was on one */
	if (flag & CL_EXPIRE) {
		if (!list_empty(&old->mnt_expire))
			list_add(&mnt->mnt_expire, &old->mnt_expire);
	}

	return mnt;

 out_free:
	mnt_free_id(mnt);
	free_vfsmnt(mnt);
	return ERR_PTR(err);
}
1007 1007
/*
 * Final teardown of a mount whose refcount has dropped to zero: kill any
 * remaining pins, notify fsnotify, drop the root dentry and superblock
 * references, release the mount id, and free the structure after an RCU
 * grace period.
 */
static void cleanup_mnt(struct mount *mnt)
{
	/*
	 * This probably indicates that somebody messed
	 * up a mnt_want/drop_write() pair. If this
	 * happens, the filesystem was probably unable
	 * to make r/w->r/o transitions.
	 */
	/*
	 * The locking used to deal with mnt_count decrement provides barriers,
	 * so mnt_get_writers() below is safe.
	 */
	WARN_ON(mnt_get_writers(mnt));
	if (unlikely(mnt->mnt_pins.first))
		mnt_pin_kill(mnt);
	fsnotify_vfsmount_delete(&mnt->mnt);
	dput(mnt->mnt.mnt_root);
	deactivate_super(mnt->mnt.mnt_sb);
	mnt_free_id(mnt);
	call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
}
1029 1029
1030 static void __cleanup_mnt(struct rcu_head *head) 1030 static void __cleanup_mnt(struct rcu_head *head)
1031 { 1031 {
1032 cleanup_mnt(container_of(head, struct mount, mnt_rcu)); 1032 cleanup_mnt(container_of(head, struct mount, mnt_rcu));
1033 } 1033 }
1034 1034
/* Mounts queued for deferred teardown from contexts that can't sleep. */
static LLIST_HEAD(delayed_mntput_list);
/* Workqueue handler: drain the deferred list and clean each mount up. */
static void delayed_mntput(struct work_struct *unused)
{
	struct llist_node *node = llist_del_all(&delayed_mntput_list);
	struct llist_node *next;

	for (; node; node = next) {
		next = llist_next(node);
		cleanup_mnt(llist_entry(node, struct mount, mnt_llist));
	}
}
static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
1047 1047
/*
 * Drop a reference on @mnt without touching expiry state.  If this was
 * the last reference (and the mount is no longer in a namespace), mark
 * the mount MNT_DOOMED, unlink it from the superblock, and tear it down:
 * directly for kernel-internal mounts, via task_work for normal process
 * context, or via the delayed workqueue otherwise (e.g. kthreads).
 * The RCU read section keeps @mnt alive across the count manipulation.
 */
static void mntput_no_expire(struct mount *mnt)
{
	rcu_read_lock();
	mnt_add_count(mnt, -1);
	if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
		rcu_read_unlock();
		return;
	}
	lock_mount_hash();
	if (mnt_get_count(mnt)) {
		/* someone else still holds a reference */
		rcu_read_unlock();
		unlock_mount_hash();
		return;
	}
	if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
		/* another path already committed to tearing this down */
		rcu_read_unlock();
		unlock_mount_hash();
		return;
	}
	mnt->mnt.mnt_flags |= MNT_DOOMED;
	rcu_read_unlock();

	list_del(&mnt->mnt_instance);
	unlock_mount_hash();

	if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
		struct task_struct *task = current;
		if (likely(!(task->flags & PF_KTHREAD))) {
			/* defer to task_work so teardown runs in a
			 * context that may sleep */
			init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
			if (!task_work_add(task, &mnt->mnt_rcu, true))
				return;
		}
		if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
			schedule_delayed_work(&delayed_mntput_work, 1);
		return;
	}
	cleanup_mnt(mnt);
}
1086 1086
/*
 * Drop a reference on @mnt (NULL is a no-op), clearing the expiry mark
 * first so an in-use mount isn't auto-expired.
 */
void mntput(struct vfsmount *mnt)
{
	if (mnt) {
		struct mount *m = real_mount(mnt);
		/* avoid cacheline pingpong, hope gcc doesn't get "smart" */
		if (unlikely(m->mnt_expiry_mark))
			m->mnt_expiry_mark = 0;
		mntput_no_expire(m);
	}
}
EXPORT_SYMBOL(mntput);
1098 1098
1099 struct vfsmount *mntget(struct vfsmount *mnt) 1099 struct vfsmount *mntget(struct vfsmount *mnt)
1100 { 1100 {
1101 if (mnt) 1101 if (mnt)
1102 mnt_add_count(real_mount(mnt), 1); 1102 mnt_add_count(real_mount(mnt), 1);
1103 return mnt; 1103 return mnt;
1104 } 1104 }
1105 EXPORT_SYMBOL(mntget); 1105 EXPORT_SYMBOL(mntget);
1106 1106
/*
 * Clone the mount at @path as a private, kernel-internal mount
 * (CL_PRIVATE + MNT_INTERNAL).  Returns the clone or an ERR_PTR.
 */
struct vfsmount *mnt_clone_internal(struct path *path)
{
	struct mount *p;
	p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
	if (IS_ERR(p))
		return ERR_CAST(p);
	p->mnt.mnt_flags |= MNT_INTERNAL;
	return &p->mnt;
}
1116 1116
/* Emit @s to @m, escaping whitespace and backslashes for /proc output. */
static inline void mangle(struct seq_file *m, const char *s)
{
	seq_escape(m, s, " \t\n\\");
}
1121 1121
/*
 * Simple .show_options callback for filesystems which don't want to
 * implement more complex mount option showing.
 *
 * See also save_mount_options().
 *
 * Prints the saved s_options string (escaped) after a leading comma.
 * s_options is read under RCU because replace_mount_options() may swap
 * and free it concurrently.
 */
int generic_show_options(struct seq_file *m, struct dentry *root)
{
	const char *options;

	rcu_read_lock();
	options = rcu_dereference(root->d_sb->s_options);

	if (options != NULL && options[0]) {
		seq_putc(m, ',');
		mangle(m, options);
	}
	rcu_read_unlock();

	return 0;
}
EXPORT_SYMBOL(generic_show_options);
1144 1144
/*
 * If filesystem uses generic_show_options(), this function should be
 * called from the fill_super() callback.
 *
 * The .remount_fs callback usually needs to be handled in a special
 * way, to make sure, that previous options are not overwritten if the
 * remount fails.
 *
 * Also note, that if the filesystem's .remount_fs function doesn't
 * reset all options to their default value, but changes only newly
 * given options, then the displayed options will not reflect reality
 * any more.
 */
void save_mount_options(struct super_block *sb, char *options)
{
	/* Must only be called once per superblock (from fill_super()). */
	BUG_ON(sb->s_options);
	/* Publish the copy for RCU readers in generic_show_options(). */
	rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL));
}
EXPORT_SYMBOL(save_mount_options);
1164 1164
/*
 * Swap in a new saved-options string, taking ownership of @options.
 * The old string is freed only after an RCU grace period so that
 * concurrent generic_show_options() readers never see freed memory.
 */
void replace_mount_options(struct super_block *sb, char *options)
{
	char *old = sb->s_options;
	rcu_assign_pointer(sb->s_options, options);
	if (old) {
		/* Wait for RCU readers of the old string before freeing. */
		synchronize_rcu();
		kfree(old);
	}
}
EXPORT_SYMBOL(replace_mount_options);
1175 1175
#ifdef CONFIG_PROC_FS
/* iterator; we want it to have access to namespace_sem, thus here... */
static void *m_start(struct seq_file *m, loff_t *pos)
{
	struct proc_mounts *p = proc_mounts(m);

	down_read(&namespace_sem);
	/*
	 * Fast path: if the namespace has not changed since the cached
	 * position was recorded, resume from the cached entry (or the
	 * one right after it) instead of rescanning from the list head.
	 */
	if (p->cached_event == p->ns->event) {
		void *v = p->cached_mount;
		if (*pos == p->cached_index)
			return v;
		if (*pos == p->cached_index + 1) {
			v = seq_list_next(v, &p->ns->list, &p->cached_index);
			return p->cached_mount = v;
		}
	}

	/* Slow path: walk to *pos from the head and refresh the cache. */
	p->cached_event = p->ns->event;
	p->cached_mount = seq_list_start(&p->ns->list, *pos);
	p->cached_index = *pos;
	return p->cached_mount;
}
1198 1198
1199 static void *m_next(struct seq_file *m, void *v, loff_t *pos) 1199 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
1200 { 1200 {
1201 struct proc_mounts *p = proc_mounts(m); 1201 struct proc_mounts *p = proc_mounts(m);
1202 1202
1203 p->cached_mount = seq_list_next(v, &p->ns->list, pos); 1203 p->cached_mount = seq_list_next(v, &p->ns->list, pos);
1204 p->cached_index = *pos; 1204 p->cached_index = *pos;
1205 return p->cached_mount; 1205 return p->cached_mount;
1206 } 1206 }
1207 1207
static void m_stop(struct seq_file *m, void *v)
{
	/* Pairs with the down_read() taken in m_start(). */
	up_read(&namespace_sem);
}
1212 1212
1213 static int m_show(struct seq_file *m, void *v) 1213 static int m_show(struct seq_file *m, void *v)
1214 { 1214 {
1215 struct proc_mounts *p = proc_mounts(m); 1215 struct proc_mounts *p = proc_mounts(m);
1216 struct mount *r = list_entry(v, struct mount, mnt_list); 1216 struct mount *r = list_entry(v, struct mount, mnt_list);
1217 return p->show(m, &r->mnt); 1217 return p->show(m, &r->mnt);
1218 } 1218 }
1219 1219
/* seq_file operations backing the mount tables shown under /proc. */
const struct seq_operations mounts_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= m_show,
};
#endif /* CONFIG_PROC_FS */
1227 1227
/**
 * may_umount_tree - check if a mount tree is busy
 * @m: root of mount tree
 *
 * Walks every mount in the tree rooted at @m and compares the actual
 * reference counts against the minimum expected for an idle tree.
 * Returns 1 when nothing (open files, pwds, chroots, busy submounts)
 * holds an extra reference, 0 otherwise.
 */
int may_umount_tree(struct vfsmount *m)
{
	struct mount *root = real_mount(m);
	struct mount *p;
	int refs = 0;
	int min_refs = 0;

	BUG_ON(!m);

	/* write lock needed for mnt_get_count */
	lock_mount_hash();
	for (p = root; p; p = next_mnt(p, root)) {
		refs += mnt_get_count(p);
		min_refs += 2;
	}
	unlock_mount_hash();

	return refs <= min_refs;
}

EXPORT_SYMBOL(may_umount_tree);
1259 1259
1260 /** 1260 /**
1261 * may_umount - check if a mount point is busy 1261 * may_umount - check if a mount point is busy
1262 * @mnt: root of mount 1262 * @mnt: root of mount
1263 * 1263 *
1264 * This is called to check if a mount point has any 1264 * This is called to check if a mount point has any
1265 * open files, pwds, chroots or sub mounts. If the 1265 * open files, pwds, chroots or sub mounts. If the
1266 * mount has sub mounts this will return busy 1266 * mount has sub mounts this will return busy
1267 * regardless of whether the sub mounts are busy. 1267 * regardless of whether the sub mounts are busy.
1268 * 1268 *
1269 * Doesn't take quota and stuff into account. IOW, in some cases it will 1269 * Doesn't take quota and stuff into account. IOW, in some cases it will
1270 * give false negatives. The main reason why it's here is that we need 1270 * give false negatives. The main reason why it's here is that we need
1271 * a non-destructive way to look for easily umountable filesystems. 1271 * a non-destructive way to look for easily umountable filesystems.
1272 */ 1272 */
1273 int may_umount(struct vfsmount *mnt) 1273 int may_umount(struct vfsmount *mnt)
1274 { 1274 {
1275 int ret = 1; 1275 int ret = 1;
1276 down_read(&namespace_sem); 1276 down_read(&namespace_sem);
1277 lock_mount_hash(); 1277 lock_mount_hash();
1278 if (propagate_mount_busy(real_mount(mnt), 2)) 1278 if (propagate_mount_busy(real_mount(mnt), 2))
1279 ret = 0; 1279 ret = 0;
1280 unlock_mount_hash(); 1280 unlock_mount_hash();
1281 up_read(&namespace_sem); 1281 up_read(&namespace_sem);
1282 return ret; 1282 return ret;
1283 } 1283 }
1284 1284
1285 EXPORT_SYMBOL(may_umount); 1285 EXPORT_SYMBOL(may_umount);
1286 1286
static HLIST_HEAD(unmounted);	/* protected by namespace_sem */

/*
 * Drop namespace_sem and dispose of every mount that umount_tree()
 * queued on "unmounted" while the lock was held.  The final mntput()s
 * happen only after an RCU grace period, so lockless path walks that
 * may still reference these mounts have finished first.
 */
static void namespace_unlock(void)
{
	struct mount *mnt;
	struct hlist_head head = unmounted;

	if (likely(hlist_empty(&head))) {
		up_write(&namespace_sem);
		return;
	}

	/* Steal the list: repoint pprev at the local copy, reset global. */
	head.first->pprev = &head.first;
	INIT_HLIST_HEAD(&unmounted);

	/* undo decrements we'd done in umount_tree() */
	hlist_for_each_entry(mnt, &head, mnt_hash)
		if (mnt->mnt_ex_mountpoint.mnt)
			mntget(mnt->mnt_ex_mountpoint.mnt);

	up_write(&namespace_sem);

	/* Let any RCU-protected walkers finish before tearing down. */
	synchronize_rcu();

	while (!hlist_empty(&head)) {
		mnt = hlist_entry(head.first, struct mount, mnt_hash);
		hlist_del_init(&mnt->mnt_hash);
		if (mnt->mnt_ex_mountpoint.mnt)
			path_put(&mnt->mnt_ex_mountpoint);
		mntput(&mnt->mnt);
	}
}

/* Take namespace_sem for write; released by namespace_unlock(). */
static inline void namespace_lock(void)
{
	down_write(&namespace_sem);
}
1324 1324
/*
 * mount_lock must be held
 * namespace_sem must be held for write
 * how = 0 => just this tree, don't propagate
 * how = 1 => propagate; we know that nobody else has reference to any victims
 * how = 2 => lazy umount
 */
void umount_tree(struct mount *mnt, int how)
{
	HLIST_HEAD(tmp_list);
	struct mount *p;
	struct mount *last = NULL;

	/* Collect the whole subtree rooted at @mnt onto a private list. */
	for (p = mnt; p; p = next_mnt(p, mnt)) {
		hlist_del_init_rcu(&p->mnt_hash);
		hlist_add_head(&p->mnt_hash, &tmp_list);
	}

	hlist_for_each_entry(p, &tmp_list, mnt_hash)
		list_del_init(&p->mnt_child);

	if (how)
		propagate_umount(&tmp_list);

	/* Detach each victim from its namespace and its parent. */
	hlist_for_each_entry(p, &tmp_list, mnt_hash) {
		list_del_init(&p->mnt_expire);
		list_del_init(&p->mnt_list);
		__touch_mnt_namespace(p->mnt_ns);
		p->mnt_ns = NULL;
		if (how < 2)
			p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
		if (mnt_has_parent(p)) {
			hlist_del_init(&p->mnt_mp_list);
			put_mountpoint(p->mnt_mp);
			mnt_add_count(p->mnt_parent, -1);
			/* move the reference to mountpoint into ->mnt_ex_mountpoint */
			p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint;
			p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt;
			p->mnt_mountpoint = p->mnt.mnt_root;
			p->mnt_parent = p;
			p->mnt_mp = NULL;
		}
		change_mnt_propagation(p, MS_PRIVATE);
		last = p;
	}
	/*
	 * Splice tmp_list onto the global "unmounted" list; the actual
	 * teardown happens later in namespace_unlock().
	 */
	if (last) {
		last->mnt_hash.next = unmounted.first;
		if (unmounted.first)
			unmounted.first->pprev = &last->mnt_hash.next;
		unmounted.first = tmp_list.first;
		unmounted.first->pprev = &unmounted.first;
	}
}
1378 1378
static void shrink_submounts(struct mount *mnt);

/*
 * Core of umount(2): detach @mnt from the namespace according to
 * @flags (MNT_FORCE, MNT_DETACH, MNT_EXPIRE).  Returns 0 on success
 * or a negative errno.
 */
static int do_umount(struct mount *mnt, int flags)
{
	struct super_block *sb = mnt->mnt.mnt_sb;
	int retval;

	retval = security_sb_umount(&mnt->mnt, flags);
	if (retval)
		return retval;

	/*
	 * Allow userspace to request a mountpoint be expired rather than
	 * unmounting unconditionally. Unmount only happens if:
	 * (1) the mark is already set (the mark is cleared by mntput())
	 * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
	 */
	if (flags & MNT_EXPIRE) {
		if (&mnt->mnt == current->fs->root.mnt ||
		    flags & (MNT_FORCE | MNT_DETACH))
			return -EINVAL;

		/*
		 * probably don't strictly need the lock here if we examined
		 * all race cases, but it's a slowpath.
		 */
		lock_mount_hash();
		if (mnt_get_count(mnt) != 2) {
			unlock_mount_hash();
			return -EBUSY;
		}
		unlock_mount_hash();

		if (!xchg(&mnt->mnt_expiry_mark, 1))
			return -EAGAIN;
	}

	/*
	 * If we may have to abort operations to get out of this
	 * mount, and they will themselves hold resources we must
	 * allow the fs to do things. In the Unix tradition of
	 * 'Gee thats tricky lets do it in userspace' the umount_begin
	 * might fail to complete on the first run through as other tasks
	 * must return, and the like. Thats for the mount program to worry
	 * about for the moment.
	 */

	if (flags & MNT_FORCE && sb->s_op->umount_begin) {
		sb->s_op->umount_begin(sb);
	}

	/*
	 * No sense to grab the lock for this test, but test itself looks
	 * somewhat bogus. Suggestions for better replacement?
	 * Ho-hum... In principle, we might treat that as umount + switch
	 * to rootfs. GC would eventually take care of the old vfsmount.
	 * Actually it makes sense, especially if rootfs would contain a
	 * /reboot - static binary that would close all descriptors and
	 * call reboot(9). Then init(8) could umount root and exec /reboot.
	 */
	if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
		/*
		 * Special case for "unmounting" root ...
		 * we just try to remount it readonly.
		 */
		if (!capable(CAP_SYS_ADMIN))
			return -EPERM;
		down_write(&sb->s_umount);
		if (!(sb->s_flags & MS_RDONLY))
			retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
		up_write(&sb->s_umount);
		return retval;
	}

	namespace_lock();
	lock_mount_hash();
	event++;

	if (flags & MNT_DETACH) {
		/* Lazy umount: detach now, finish when the last user drops. */
		if (!list_empty(&mnt->mnt_list))
			umount_tree(mnt, 2);
		retval = 0;
	} else {
		shrink_submounts(mnt);
		retval = -EBUSY;
		/* Refuse unless the whole propagation group is idle. */
		if (!propagate_mount_busy(mnt, 2)) {
			if (!list_empty(&mnt->mnt_list))
				umount_tree(mnt, 1);
			retval = 0;
		}
	}
	unlock_mount_hash();
	namespace_unlock();
	return retval;
}
1474 1474
/*
 * __detach_mounts - lazily unmount all mounts on the specified dentry
 *
 * During unlink, rmdir, and d_drop it is possible to loose the path
 * to an existing mountpoint, and wind up leaking the mount.
 * detach_mounts allows lazily unmounting those mounts instead of
 * leaking them.
 *
 * The caller may hold dentry->d_inode->i_mutex.
 */
void __detach_mounts(struct dentry *dentry)
{
	struct mountpoint *mp;
	struct mount *mnt;

	namespace_lock();
	mp = lookup_mountpoint(dentry);
	if (!mp)
		goto out_unlock;	/* nothing is mounted here */

	/* Lazily unmount every mount still attached to this mountpoint. */
	lock_mount_hash();
	while (!hlist_empty(&mp->m_list)) {
		mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
		umount_tree(mnt, 2);
	}
	unlock_mount_hash();
	put_mountpoint(mp);
out_unlock:
	namespace_unlock();
}
1505 1505
/*
 * Is the caller allowed to modify his namespace?
 * Requires CAP_SYS_ADMIN in the user namespace owning the caller's
 * mount namespace.
 */
static inline bool may_mount(void)
{
	return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
}
1513 1513
/*
 * Now umount can handle mount points as well as block devices.
 * This is important for filesystems which use unnamed block devices.
 *
 * We now support a flag for forced unmount like the other 'big iron'
 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
 */

SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
{
	struct path path;
	struct mount *mnt;
	int retval;
	int lookup_flags = 0;

	/* Reject any flag bits we don't understand. */
	if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
		return -EINVAL;

	if (!may_mount())
		return -EPERM;

	if (!(flags & UMOUNT_NOFOLLOW))
		lookup_flags |= LOOKUP_FOLLOW;

	retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
	if (retval)
		goto out;
	mnt = real_mount(path.mnt);
	retval = -EINVAL;
	if (path.dentry != path.mnt->mnt_root)
		goto dput_and_out;	/* target is not the root of a mount */
	if (!check_mnt(mnt))
		goto dput_and_out;	/* not attached in our namespace */
	if (mnt->mnt.mnt_flags & MNT_LOCKED)
		goto dput_and_out;	/* locked against this namespace */

	retval = do_umount(mnt, flags);
dput_and_out:
	/* we mustn't call path_put() as that would clear mnt_expiry_mark */
	dput(path.dentry);
	mntput_no_expire(mnt);
out:
	return retval;
}
1558 1558
#ifdef __ARCH_WANT_SYS_OLDUMOUNT

/*
 * The 2.0 compatible umount. No flags.
 */
SYSCALL_DEFINE1(oldumount, char __user *, name)
{
	return sys_umount(name, 0);
}

#endif
1570 1570
1571 static bool is_mnt_ns_file(struct dentry *dentry) 1571 static bool is_mnt_ns_file(struct dentry *dentry)
1572 { 1572 {
1573 /* Is this a proxy for a mount namespace? */ 1573 /* Is this a proxy for a mount namespace? */
1574 struct inode *inode = dentry->d_inode; 1574 struct inode *inode = dentry->d_inode;
1575 struct proc_ns *ei; 1575 struct proc_ns *ei;
1576 1576
1577 if (!proc_ns_inode(inode)) 1577 if (!proc_ns_inode(inode))
1578 return false; 1578 return false;
1579 1579
1580 ei = get_proc_ns(inode); 1580 ei = get_proc_ns(inode);
1581 if (ei->ns_ops != &mntns_operations) 1581 if (ei->ns_ops != &mntns_operations)
1582 return false; 1582 return false;
1583 1583
1584 return true; 1584 return true;
1585 } 1585 }
1586 1586
1587 static bool mnt_ns_loop(struct dentry *dentry) 1587 static bool mnt_ns_loop(struct dentry *dentry)
1588 { 1588 {
1589 /* Could bind mounting the mount namespace inode cause a 1589 /* Could bind mounting the mount namespace inode cause a
1590 * mount namespace loop? 1590 * mount namespace loop?
1591 */ 1591 */
1592 struct mnt_namespace *mnt_ns; 1592 struct mnt_namespace *mnt_ns;
1593 if (!is_mnt_ns_file(dentry)) 1593 if (!is_mnt_ns_file(dentry))
1594 return false; 1594 return false;
1595 1595
1596 mnt_ns = get_proc_ns(dentry->d_inode)->ns; 1596 mnt_ns = get_proc_ns(dentry->d_inode)->ns;
1597 return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; 1597 return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
1598 } 1598 }
1599 1599
/*
 * Clone the mount subtree of @mnt that hangs below @dentry.  @flag is
 * a mask of CL_* options controlling what may be copied (unbindable
 * mounts, mount-namespace files, locked state).  Returns the root of
 * the new tree, or an ERR_PTR() on failure; on failure any partially
 * built copy is torn down.
 */
struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
					int flag)
{
	struct mount *res, *p, *q, *r, *parent;

	if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
		return ERR_PTR(-EINVAL);

	if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
		return ERR_PTR(-EINVAL);

	/* Clone the root of the tree first. */
	res = q = clone_mnt(mnt, dentry, flag);
	if (IS_ERR(q))
		return q;

	q->mnt.mnt_flags &= ~MNT_LOCKED;
	q->mnt_mountpoint = mnt->mnt_mountpoint;

	/*
	 * Walk the original tree; p tracks our position in the source,
	 * q the corresponding position in the copy.
	 */
	p = mnt;
	list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
		struct mount *s;
		if (!is_subdir(r->mnt_mountpoint, dentry))
			continue;

		for (s = r; s; s = next_mnt(s, r)) {
			struct mount *t = NULL;
			if (!(flag & CL_COPY_UNBINDABLE) &&
			    IS_MNT_UNBINDABLE(s)) {
				s = skip_mnt_tree(s);
				continue;
			}
			if (!(flag & CL_COPY_MNT_NS_FILE) &&
			    is_mnt_ns_file(s->mnt.mnt_root)) {
				s = skip_mnt_tree(s);
				continue;
			}
			/* Climb back up until p is the parent of s. */
			while (p != s->mnt_parent) {
				p = p->mnt_parent;
				q = q->mnt_parent;
			}
			p = s;
			parent = q;
			q = clone_mnt(p, p->mnt.mnt_root, flag);
			if (IS_ERR(q))
				goto out;
			lock_mount_hash();
			list_add_tail(&q->mnt_list, &res->mnt_list);
			mnt_set_mountpoint(parent, p->mnt_mp, q);
			/*
			 * If the last child copied onto @parent shares the
			 * same mountpoint, attach the clone behind it to
			 * preserve the original shadowing order.
			 */
			if (!list_empty(&parent->mnt_mounts)) {
				t = list_last_entry(&parent->mnt_mounts,
					struct mount, mnt_child);
				if (t->mnt_mp != p->mnt_mp)
					t = NULL;
			}
			attach_shadowed(q, parent, t);
			unlock_mount_hash();
		}
	}
	return res;
out:
	/* Tear down whatever part of the copy we managed to build. */
	if (res) {
		lock_mount_hash();
		umount_tree(res, 0);
		unlock_mount_hash();
	}
	return q;
}
1667 1667
1668 /* Caller should check returned pointer for errors */ 1668 /* Caller should check returned pointer for errors */
1669 1669
1670 struct vfsmount *collect_mounts(struct path *path) 1670 struct vfsmount *collect_mounts(struct path *path)
1671 { 1671 {
1672 struct mount *tree; 1672 struct mount *tree;
1673 namespace_lock(); 1673 namespace_lock();
1674 tree = copy_tree(real_mount(path->mnt), path->dentry, 1674 tree = copy_tree(real_mount(path->mnt), path->dentry,
1675 CL_COPY_ALL | CL_PRIVATE); 1675 CL_COPY_ALL | CL_PRIVATE);
1676 namespace_unlock(); 1676 namespace_unlock();
1677 if (IS_ERR(tree)) 1677 if (IS_ERR(tree))
1678 return ERR_CAST(tree); 1678 return ERR_CAST(tree);
1679 return &tree->mnt; 1679 return &tree->mnt;
1680 } 1680 }
1681 1681
/* Release a tree previously obtained from collect_mounts(). */
void drop_collected_mounts(struct vfsmount *mnt)
{
	namespace_lock();
	lock_mount_hash();
	umount_tree(real_mount(mnt), 0);
	unlock_mount_hash();
	namespace_unlock();
}
1690 1690
1691 /** 1691 /**
1692 * clone_private_mount - create a private clone of a path 1692 * clone_private_mount - create a private clone of a path
1693 * 1693 *
1694 * This creates a new vfsmount, which will be the clone of @path. The new will 1694 * This creates a new vfsmount, which will be the clone of @path. The new will
1695 * not be attached anywhere in the namespace and will be private (i.e. changes 1695 * not be attached anywhere in the namespace and will be private (i.e. changes
1696 * to the originating mount won't be propagated into this). 1696 * to the originating mount won't be propagated into this).
1697 * 1697 *
1698 * Release with mntput(). 1698 * Release with mntput().
1699 */ 1699 */
1700 struct vfsmount *clone_private_mount(struct path *path) 1700 struct vfsmount *clone_private_mount(struct path *path)
1701 { 1701 {
1702 struct mount *old_mnt = real_mount(path->mnt); 1702 struct mount *old_mnt = real_mount(path->mnt);
1703 struct mount *new_mnt; 1703 struct mount *new_mnt;
1704 1704
1705 if (IS_MNT_UNBINDABLE(old_mnt)) 1705 if (IS_MNT_UNBINDABLE(old_mnt))
1706 return ERR_PTR(-EINVAL); 1706 return ERR_PTR(-EINVAL);
1707 1707
1708 down_read(&namespace_sem); 1708 down_read(&namespace_sem);
1709 new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); 1709 new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
1710 up_read(&namespace_sem); 1710 up_read(&namespace_sem);
1711 if (IS_ERR(new_mnt)) 1711 if (IS_ERR(new_mnt))
1712 return ERR_CAST(new_mnt); 1712 return ERR_CAST(new_mnt);
1713 1713
1714 return &new_mnt->mnt; 1714 return &new_mnt->mnt;
1715 } 1715 }
1716 EXPORT_SYMBOL_GPL(clone_private_mount); 1716 EXPORT_SYMBOL_GPL(clone_private_mount);
1717 1717
1718 int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, 1718 int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
1719 struct vfsmount *root) 1719 struct vfsmount *root)
1720 { 1720 {
1721 struct mount *mnt; 1721 struct mount *mnt;
1722 int res = f(root, arg); 1722 int res = f(root, arg);
1723 if (res) 1723 if (res)
1724 return res; 1724 return res;
1725 list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) { 1725 list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
1726 res = f(&mnt->mnt, arg); 1726 res = f(&mnt->mnt, arg);
1727 if (res) 1727 if (res)
1728 return res; 1728 return res;
1729 } 1729 }
1730 return 0; 1730 return 0;
1731 } 1731 }
1732 1732
1733 static void cleanup_group_ids(struct mount *mnt, struct mount *end) 1733 static void cleanup_group_ids(struct mount *mnt, struct mount *end)
1734 { 1734 {
1735 struct mount *p; 1735 struct mount *p;
1736 1736
1737 for (p = mnt; p != end; p = next_mnt(p, mnt)) { 1737 for (p = mnt; p != end; p = next_mnt(p, mnt)) {
1738 if (p->mnt_group_id && !IS_MNT_SHARED(p)) 1738 if (p->mnt_group_id && !IS_MNT_SHARED(p))
1739 mnt_release_group_id(p); 1739 mnt_release_group_id(p);
1740 } 1740 }
1741 } 1741 }
1742 1742
1743 static int invent_group_ids(struct mount *mnt, bool recurse) 1743 static int invent_group_ids(struct mount *mnt, bool recurse)
1744 { 1744 {
1745 struct mount *p; 1745 struct mount *p;
1746 1746
1747 for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { 1747 for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
1748 if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { 1748 if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
1749 int err = mnt_alloc_group_id(p); 1749 int err = mnt_alloc_group_id(p);
1750 if (err) { 1750 if (err) {
1751 cleanup_group_ids(mnt, p); 1751 cleanup_group_ids(mnt, p);
1752 return err; 1752 return err;
1753 } 1753 }
1754 } 1754 }
1755 } 1755 }
1756 1756
1757 return 0; 1757 return 0;
1758 } 1758 }
1759 1759
1760 /* 1760 /*
1761 * @source_mnt : mount tree to be attached 1761 * @source_mnt : mount tree to be attached
1762 * @nd : place the mount tree @source_mnt is attached 1762 * @nd : place the mount tree @source_mnt is attached
1763 * @parent_nd : if non-null, detach the source_mnt from its parent and 1763 * @parent_nd : if non-null, detach the source_mnt from its parent and
1764 * store the parent mount and mountpoint dentry. 1764 * store the parent mount and mountpoint dentry.
1765 * (done when source_mnt is moved) 1765 * (done when source_mnt is moved)
1766 * 1766 *
 * NOTE: the table below explains the semantics when a source mount
1768 * of a given type is attached to a destination mount of a given type. 1768 * of a given type is attached to a destination mount of a given type.
1769 * --------------------------------------------------------------------------- 1769 * ---------------------------------------------------------------------------
1770 * | BIND MOUNT OPERATION | 1770 * | BIND MOUNT OPERATION |
1771 * |************************************************************************** 1771 * |**************************************************************************
1772 * | source-->| shared | private | slave | unbindable | 1772 * | source-->| shared | private | slave | unbindable |
1773 * | dest | | | | | 1773 * | dest | | | | |
1774 * | | | | | | | 1774 * | | | | | | |
1775 * | v | | | | | 1775 * | v | | | | |
1776 * |************************************************************************** 1776 * |**************************************************************************
1777 * | shared | shared (++) | shared (+) | shared(+++)| invalid | 1777 * | shared | shared (++) | shared (+) | shared(+++)| invalid |
1778 * | | | | | | 1778 * | | | | | |
1779 * |non-shared| shared (+) | private | slave (*) | invalid | 1779 * |non-shared| shared (+) | private | slave (*) | invalid |
1780 * *************************************************************************** 1780 * ***************************************************************************
1781 * A bind operation clones the source mount and mounts the clone on the 1781 * A bind operation clones the source mount and mounts the clone on the
1782 * destination mount. 1782 * destination mount.
1783 * 1783 *
1784 * (++) the cloned mount is propagated to all the mounts in the propagation 1784 * (++) the cloned mount is propagated to all the mounts in the propagation
1785 * tree of the destination mount and the cloned mount is added to 1785 * tree of the destination mount and the cloned mount is added to
1786 * the peer group of the source mount. 1786 * the peer group of the source mount.
1787 * (+) the cloned mount is created under the destination mount and is marked 1787 * (+) the cloned mount is created under the destination mount and is marked
1788 * as shared. The cloned mount is added to the peer group of the source 1788 * as shared. The cloned mount is added to the peer group of the source
1789 * mount. 1789 * mount.
1790 * (+++) the mount is propagated to all the mounts in the propagation tree 1790 * (+++) the mount is propagated to all the mounts in the propagation tree
1791 * of the destination mount and the cloned mount is made slave 1791 * of the destination mount and the cloned mount is made slave
1792 * of the same master as that of the source mount. The cloned mount 1792 * of the same master as that of the source mount. The cloned mount
1793 * is marked as 'shared and slave'. 1793 * is marked as 'shared and slave'.
1794 * (*) the cloned mount is made a slave of the same master as that of the 1794 * (*) the cloned mount is made a slave of the same master as that of the
1795 * source mount. 1795 * source mount.
1796 * 1796 *
1797 * --------------------------------------------------------------------------- 1797 * ---------------------------------------------------------------------------
1798 * | MOVE MOUNT OPERATION | 1798 * | MOVE MOUNT OPERATION |
1799 * |************************************************************************** 1799 * |**************************************************************************
1800 * | source-->| shared | private | slave | unbindable | 1800 * | source-->| shared | private | slave | unbindable |
1801 * | dest | | | | | 1801 * | dest | | | | |
1802 * | | | | | | | 1802 * | | | | | | |
1803 * | v | | | | | 1803 * | v | | | | |
1804 * |************************************************************************** 1804 * |**************************************************************************
1805 * | shared | shared (+) | shared (+) | shared(+++) | invalid | 1805 * | shared | shared (+) | shared (+) | shared(+++) | invalid |
1806 * | | | | | | 1806 * | | | | | |
1807 * |non-shared| shared (+*) | private | slave (*) | unbindable | 1807 * |non-shared| shared (+*) | private | slave (*) | unbindable |
1808 * *************************************************************************** 1808 * ***************************************************************************
1809 * 1809 *
1810 * (+) the mount is moved to the destination. And is then propagated to 1810 * (+) the mount is moved to the destination. And is then propagated to
1811 * all the mounts in the propagation tree of the destination mount. 1811 * all the mounts in the propagation tree of the destination mount.
1812 * (+*) the mount is moved to the destination. 1812 * (+*) the mount is moved to the destination.
1813 * (+++) the mount is moved to the destination and is then propagated to 1813 * (+++) the mount is moved to the destination and is then propagated to
1814 * all the mounts belonging to the destination mount's propagation tree. 1814 * all the mounts belonging to the destination mount's propagation tree.
1815 * the mount is marked as 'shared and slave'. 1815 * the mount is marked as 'shared and slave'.
1816 * (*) the mount continues to be a slave at the new location. 1816 * (*) the mount continues to be a slave at the new location.
1817 * 1817 *
1818 * if the source mount is a tree, the operations explained above is 1818 * if the source mount is a tree, the operations explained above is
1819 * applied to each mount in the tree. 1819 * applied to each mount in the tree.
1820 * Must be called without spinlocks held, since this function can sleep 1820 * Must be called without spinlocks held, since this function can sleep
1821 * in allocations. 1821 * in allocations.
1822 */ 1822 */
static int attach_recursive_mnt(struct mount *source_mnt,
			struct mount *dest_mnt,
			struct mountpoint *dest_mp,
			struct path *parent_path)
{
	HLIST_HEAD(tree_list);
	struct mount *child, *p;
	struct hlist_node *n;
	int err;

	if (IS_MNT_SHARED(dest_mnt)) {
		/*
		 * Destination is shared: the whole source tree becomes
		 * shared and copies of it must be propagated to every
		 * peer/slave of the destination.  Allocate peer-group
		 * ids first so failure is handled before anything is
		 * hashed; propagate_mnt() collects the copies on
		 * tree_list for committing below.
		 */
		err = invent_group_ids(source_mnt, true);
		if (err)
			goto out;
		err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
		lock_mount_hash();
		if (err)
			goto out_cleanup_ids;
		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
			set_mnt_shared(p);
	} else {
		lock_mount_hash();
	}
	if (parent_path) {
		/* Move: detach from the old parent, attach at the new one. */
		detach_mnt(source_mnt, parent_path);
		attach_mnt(source_mnt, dest_mnt, dest_mp);
		touch_mnt_namespace(source_mnt->mnt_ns);
	} else {
		/* Fresh attach: hook the tree up and hash it in. */
		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
		commit_tree(source_mnt, NULL);
	}

	/* Commit each propagated copy under its respective parent. */
	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
		struct mount *q;
		hlist_del_init(&child->mnt_hash);
		q = __lookup_mnt_last(&child->mnt_parent->mnt,
				      child->mnt_mountpoint);
		commit_tree(child, q);
	}
	unlock_mount_hash();

	return 0;

 out_cleanup_ids:
	/* Tear down any copies propagate_mnt() managed to create. */
	while (!hlist_empty(&tree_list)) {
		child = hlist_entry(tree_list.first, struct mount, mnt_hash);
		umount_tree(child, 0);
	}
	unlock_mount_hash();
	cleanup_group_ids(source_mnt, NULL);
 out:
	return err;
}
1876 1876
/*
 * Find the final mountpoint for @path — following anything already
 * mounted on top of it — take the dentry's i_mutex plus namespace_sem,
 * and return a pinned struct mountpoint.  Pair with unlock_mount().
 */
static struct mountpoint *lock_mount(struct path *path)
{
	struct vfsmount *mnt;
	struct dentry *dentry = path->dentry;
retry:
	mutex_lock(&dentry->d_inode->i_mutex);
	/* The dentry was flagged as unusable for mounting (e.g. removed). */
	if (unlikely(cant_mount(dentry))) {
		mutex_unlock(&dentry->d_inode->i_mutex);
		return ERR_PTR(-ENOENT);
	}
	namespace_lock();
	mnt = lookup_mnt(path);
	if (likely(!mnt)) {
		/* Nothing mounted here: reuse or create the mountpoint.
		 * On success both locks stay held for the caller. */
		struct mountpoint *mp = lookup_mountpoint(dentry);
		if (!mp)
			mp = new_mountpoint(dentry);
		if (IS_ERR(mp)) {
			namespace_unlock();
			mutex_unlock(&dentry->d_inode->i_mutex);
			return mp;
		}
		return mp;
	}
	/*
	 * Something is mounted on this dentry: drop the locks, step up
	 * onto the covering mount's root and try again from there.
	 */
	namespace_unlock();
	mutex_unlock(&path->dentry->d_inode->i_mutex);
	path_put(path);
	path->mnt = mnt;
	dentry = path->dentry = dget(mnt->mnt_root);
	goto retry;
}
1907 1907
1908 static void unlock_mount(struct mountpoint *where) 1908 static void unlock_mount(struct mountpoint *where)
1909 { 1909 {
1910 struct dentry *dentry = where->m_dentry; 1910 struct dentry *dentry = where->m_dentry;
1911 put_mountpoint(where); 1911 put_mountpoint(where);
1912 namespace_unlock(); 1912 namespace_unlock();
1913 mutex_unlock(&dentry->d_inode->i_mutex); 1913 mutex_unlock(&dentry->d_inode->i_mutex);
1914 } 1914 }
1915 1915
1916 static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) 1916 static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
1917 { 1917 {
1918 if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER) 1918 if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER)
1919 return -EINVAL; 1919 return -EINVAL;
1920 1920
1921 if (S_ISDIR(mp->m_dentry->d_inode->i_mode) != 1921 if (S_ISDIR(mp->m_dentry->d_inode->i_mode) !=
1922 S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode)) 1922 S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode))
1923 return -ENOTDIR; 1923 return -ENOTDIR;
1924 1924
1925 return attach_recursive_mnt(mnt, p, mp, NULL); 1925 return attach_recursive_mnt(mnt, p, mp, NULL);
1926 } 1926 }
1927 1927
1928 /* 1928 /*
1929 * Sanity check the flags to change_mnt_propagation. 1929 * Sanity check the flags to change_mnt_propagation.
1930 */ 1930 */
1931 1931
1932 static int flags_to_propagation_type(int flags) 1932 static int flags_to_propagation_type(int flags)
1933 { 1933 {
1934 int type = flags & ~(MS_REC | MS_SILENT); 1934 int type = flags & ~(MS_REC | MS_SILENT);
1935 1935
1936 /* Fail if any non-propagation flags are set */ 1936 /* Fail if any non-propagation flags are set */
1937 if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) 1937 if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
1938 return 0; 1938 return 0;
1939 /* Only one propagation flag should be set */ 1939 /* Only one propagation flag should be set */
1940 if (!is_power_of_2(type)) 1940 if (!is_power_of_2(type))
1941 return 0; 1941 return 0;
1942 return type; 1942 return type;
1943 } 1943 }
1944 1944
/*
 * recursively change the type of the mountpoint.
 */
static int do_change_type(struct path *path, int flag)
{
	struct mount *m;
	struct mount *mnt = real_mount(path->mnt);
	int recurse = flag & MS_REC;
	int type;
	int err = 0;

	/* Propagation type may only be changed at a mount's root. */
	if (path->dentry != path->mnt->mnt_root)
		return -EINVAL;

	type = flags_to_propagation_type(flag);
	if (!type)
		return -EINVAL;

	namespace_lock();
	if (type == MS_SHARED) {
		/* Going shared needs peer-group ids allocated up front,
		 * while failure can still be reported cleanly. */
		err = invent_group_ids(mnt, recurse);
		if (err)
			goto out_unlock;
	}

	lock_mount_hash();
	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
		change_mnt_propagation(m, type);
	unlock_mount_hash();

 out_unlock:
	namespace_unlock();
	return err;
}
1979 1979
1980 static bool has_locked_children(struct mount *mnt, struct dentry *dentry) 1980 static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
1981 { 1981 {
1982 struct mount *child; 1982 struct mount *child;
1983 list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { 1983 list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
1984 if (!is_subdir(child->mnt_mountpoint, dentry)) 1984 if (!is_subdir(child->mnt_mountpoint, dentry))
1985 continue; 1985 continue;
1986 1986
1987 if (child->mnt.mnt_flags & MNT_LOCKED) 1987 if (child->mnt.mnt_flags & MNT_LOCKED)
1988 return true; 1988 return true;
1989 } 1989 }
1990 return false; 1990 return false;
1991 } 1991 }
1992 1992
/*
 * do loopback mount.
 */
static int do_loopback(struct path *path, const char *old_name,
				int recurse)
{
	struct path old_path;
	struct mount *mnt = NULL, *old, *parent;
	struct mountpoint *mp;
	int err;
	if (!old_name || !*old_name)
		return -EINVAL;
	err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
	if (err)
		return err;

	err = -EINVAL;
	/* Binding a mount-namespace file could create a namespace loop. */
	if (mnt_ns_loop(old_path.dentry))
		goto out;

	mp = lock_mount(path);
	err = PTR_ERR(mp);
	if (IS_ERR(mp))
		goto out;

	old = real_mount(old_path.mnt);
	parent = real_mount(path->mnt);

	err = -EINVAL;
	if (IS_MNT_UNBINDABLE(old))
		goto out2;

	/* Source and target must both be in the caller's namespace. */
	if (!check_mnt(parent) || !check_mnt(old))
		goto out2;

	/* A non-recursive bind must not strip away locked children. */
	if (!recurse && has_locked_children(old, old_path.dentry))
		goto out2;

	if (recurse)
		mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
	else
		mnt = clone_mnt(old, old_path.dentry, 0);

	if (IS_ERR(mnt)) {
		err = PTR_ERR(mnt);
		goto out2;
	}

	/* The clone is a fresh mount; it is never locked to its parent. */
	mnt->mnt.mnt_flags &= ~MNT_LOCKED;

	err = graft_tree(mnt, parent, mp);
	if (err) {
		/* Attachment failed: dispose of the clone(s). */
		lock_mount_hash();
		umount_tree(mnt, 0);
		unlock_mount_hash();
	}
out2:
	unlock_mount(mp);
out:
	path_put(&old_path);
	return err;
}
2055 2055
2056 static int change_mount_flags(struct vfsmount *mnt, int ms_flags) 2056 static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
2057 { 2057 {
2058 int error = 0; 2058 int error = 0;
2059 int readonly_request = 0; 2059 int readonly_request = 0;
2060 2060
2061 if (ms_flags & MS_RDONLY) 2061 if (ms_flags & MS_RDONLY)
2062 readonly_request = 1; 2062 readonly_request = 1;
2063 if (readonly_request == __mnt_is_readonly(mnt)) 2063 if (readonly_request == __mnt_is_readonly(mnt))
2064 return 0; 2064 return 0;
2065 2065
2066 if (readonly_request) 2066 if (readonly_request)
2067 error = mnt_make_readonly(real_mount(mnt)); 2067 error = mnt_make_readonly(real_mount(mnt));
2068 else 2068 else
2069 __mnt_unmake_readonly(real_mount(mnt)); 2069 __mnt_unmake_readonly(real_mount(mnt));
2070 return error; 2070 return error;
2071 } 2071 }
2072 2072
/*
 * change filesystem flags. dir should be a physical root of filesystem.
 * If you've mounted a non-root directory somewhere and want to do remount
 * on it - tough luck.
 */
static int do_remount(struct path *path, int flags, int mnt_flags,
		      void *data)
{
	int err;
	struct super_block *sb = path->mnt->mnt_sb;
	struct mount *mnt = real_mount(path->mnt);

	if (!check_mnt(mnt))
		return -EINVAL;

	if (path->dentry != path->mnt->mnt_root)
		return -EINVAL;

	/* Don't allow changing of locked mnt flags.
	 *
	 * No locks need to be held here while testing the various
	 * MNT_LOCK flags because those flags can never be cleared
	 * once they are set.
	 */
	if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
	    !(mnt_flags & MNT_READONLY)) {
		return -EPERM;
	}
	if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
	    !(mnt_flags & MNT_NODEV)) {
		/* Was the nodev implicitly added in mount?  If the mount
		 * was created in a user namespace on a filesystem without
		 * FS_USERNS_DEV_MOUNT, MNT_NODEV was forced on at mount
		 * time; re-add it implicitly here instead of failing the
		 * remount for a flag the user never asked for. */
		if ((mnt->mnt_ns->user_ns != &init_user_ns) &&
		    !(sb->s_type->fs_flags & FS_USERNS_DEV_MOUNT)) {
			mnt_flags |= MNT_NODEV;
		} else {
			return -EPERM;
		}
	}
	if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
	    !(mnt_flags & MNT_NOSUID)) {
		return -EPERM;
	}
	if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) &&
	    !(mnt_flags & MNT_NOEXEC)) {
		return -EPERM;
	}
	if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
	    ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) {
		return -EPERM;
	}

	err = security_sb_remount(sb, data);
	if (err)
		return err;

	down_write(&sb->s_umount);
	if (flags & MS_BIND)
		/* Per-mount remount: only the read-only state changes. */
		err = change_mount_flags(path->mnt, flags);
	else if (!capable(CAP_SYS_ADMIN))
		err = -EPERM;
	else
		err = do_remount_sb(sb, flags, data, 0);
	if (!err) {
		lock_mount_hash();
		/* Keep flags that userspace is not allowed to touch. */
		mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
		mnt->mnt.mnt_flags = mnt_flags;
		touch_mnt_namespace(mnt->mnt_ns);
		unlock_mount_hash();
	}
	up_write(&sb->s_umount);
	return err;
}
2139 2145
/* Return 1 if any mount in the subtree rooted at @mnt is unbindable. */
static inline int tree_contains_unbindable(struct mount *mnt)
{
	struct mount *p = mnt;

	while (p) {
		if (IS_MNT_UNBINDABLE(p))
			return 1;
		p = next_mnt(p, mnt);
	}

	return 0;
}
2149 2155
/*
 * Move the mount at @old_name so it becomes attached at @path
 * (MS_MOVE).  The source must be a root of a mount in the caller's
 * namespace, not locked to its parent, and the move must not create a
 * cycle or violate propagation rules.
 */
static int do_move_mount(struct path *path, const char *old_name)
{
	struct path old_path, parent_path;
	struct mount *p;
	struct mount *old;
	struct mountpoint *mp;
	int err;
	if (!old_name || !*old_name)
		return -EINVAL;
	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
	if (err)
		return err;

	mp = lock_mount(path);
	err = PTR_ERR(mp);
	if (IS_ERR(mp))
		goto out;

	old = real_mount(old_path.mnt);
	p = real_mount(path->mnt);

	err = -EINVAL;
	/* Both ends must live in the caller's mount namespace. */
	if (!check_mnt(p) || !check_mnt(old))
		goto out1;

	/* A mount locked to its parent may not be moved away from it. */
	if (old->mnt.mnt_flags & MNT_LOCKED)
		goto out1;

	err = -EINVAL;
	/* Only a whole mount (its root) can be moved. */
	if (old_path.dentry != old_path.mnt->mnt_root)
		goto out1;

	if (!mnt_has_parent(old))
		goto out1;

	/* Directory onto directory, non-directory onto non-directory. */
	if (S_ISDIR(path->dentry->d_inode->i_mode) !=
	      S_ISDIR(old_path.dentry->d_inode->i_mode))
		goto out1;
	/*
	 * Don't move a mount residing in a shared parent.
	 */
	if (IS_MNT_SHARED(old->mnt_parent))
		goto out1;
	/*
	 * Don't move a mount tree containing unbindable mounts to a destination
	 * mount which is shared.
	 */
	if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
		goto out1;
	err = -ELOOP;
	/* Moving a mount underneath its own subtree would create a cycle. */
	for (; mnt_has_parent(p); p = p->mnt_parent)
		if (p == old)
			goto out1;

	err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path);
	if (err)
		goto out1;

	/* if the mount is moved, it should no longer be expire
	 * automatically */
	list_del_init(&old->mnt_expire);
out1:
	unlock_mount(mp);
out:
	/* On success, parent_path holds the old attachment to drop. */
	if (!err)
		path_put(&parent_path);
	path_put(&old_path);
	return err;
}
2219 2225
2220 static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) 2226 static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
2221 { 2227 {
2222 int err; 2228 int err;
2223 const char *subtype = strchr(fstype, '.'); 2229 const char *subtype = strchr(fstype, '.');
2224 if (subtype) { 2230 if (subtype) {
2225 subtype++; 2231 subtype++;
2226 err = -EINVAL; 2232 err = -EINVAL;
2227 if (!subtype[0]) 2233 if (!subtype[0])
2228 goto err; 2234 goto err;
2229 } else 2235 } else
2230 subtype = ""; 2236 subtype = "";
2231 2237
2232 mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL); 2238 mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
2233 err = -ENOMEM; 2239 err = -ENOMEM;
2234 if (!mnt->mnt_sb->s_subtype) 2240 if (!mnt->mnt_sb->s_subtype)
2235 goto err; 2241 goto err;
2236 return mnt; 2242 return mnt;
2237 2243
2238 err: 2244 err:
2239 mntput(mnt); 2245 mntput(mnt);
2240 return ERR_PTR(err); 2246 return ERR_PTR(err);
2241 } 2247 }
2242 2248
2243 /* 2249 /*
2244 * add a mount into a namespace's mount tree 2250 * add a mount into a namespace's mount tree
2245 */ 2251 */
2246 static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) 2252 static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
2247 { 2253 {
2248 struct mountpoint *mp; 2254 struct mountpoint *mp;
2249 struct mount *parent; 2255 struct mount *parent;
2250 int err; 2256 int err;
2251 2257
2252 mnt_flags &= ~MNT_INTERNAL_FLAGS; 2258 mnt_flags &= ~MNT_INTERNAL_FLAGS;
2253 2259
2254 mp = lock_mount(path); 2260 mp = lock_mount(path);
2255 if (IS_ERR(mp)) 2261 if (IS_ERR(mp))
2256 return PTR_ERR(mp); 2262 return PTR_ERR(mp);
2257 2263
2258 parent = real_mount(path->mnt); 2264 parent = real_mount(path->mnt);
2259 err = -EINVAL; 2265 err = -EINVAL;
2260 if (unlikely(!check_mnt(parent))) { 2266 if (unlikely(!check_mnt(parent))) {
2261 /* that's acceptable only for automounts done in private ns */ 2267 /* that's acceptable only for automounts done in private ns */
2262 if (!(mnt_flags & MNT_SHRINKABLE)) 2268 if (!(mnt_flags & MNT_SHRINKABLE))
2263 goto unlock; 2269 goto unlock;
2264 /* ... and for those we'd better have mountpoint still alive */ 2270 /* ... and for those we'd better have mountpoint still alive */
2265 if (!parent->mnt_ns) 2271 if (!parent->mnt_ns)
2266 goto unlock; 2272 goto unlock;
2267 } 2273 }
2268 2274
2269 /* Refuse the same filesystem on the same mount point */ 2275 /* Refuse the same filesystem on the same mount point */
2270 err = -EBUSY; 2276 err = -EBUSY;
2271 if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && 2277 if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
2272 path->mnt->mnt_root == path->dentry) 2278 path->mnt->mnt_root == path->dentry)
2273 goto unlock; 2279 goto unlock;
2274 2280
2275 err = -EINVAL; 2281 err = -EINVAL;
2276 if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode)) 2282 if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode))
2277 goto unlock; 2283 goto unlock;
2278 2284
2279 newmnt->mnt.mnt_flags = mnt_flags; 2285 newmnt->mnt.mnt_flags = mnt_flags;
2280 err = graft_tree(newmnt, parent, mp); 2286 err = graft_tree(newmnt, parent, mp);
2281 2287
2282 unlock: 2288 unlock:
2283 unlock_mount(mp); 2289 unlock_mount(mp);
2284 return err; 2290 return err;
2285 } 2291 }
2286 2292
/*
 * create a new mount for userspace and request it to be added into the
 * namespace's tree
 */
static int do_new_mount(struct path *path, const char *fstype, int flags,
			int mnt_flags, const char *name, void *data)
{
	struct file_system_type *type;
	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
	struct vfsmount *mnt;
	int err;

	if (!fstype)
		return -EINVAL;

	/* get_fs_type() takes a reference; drop it via put_filesystem() below. */
	type = get_fs_type(fstype);
	if (!type)
		return -ENODEV;

	if (user_ns != &init_user_ns) {
		if (!(type->fs_flags & FS_USERNS_MOUNT)) {
			put_filesystem(type);
			return -EPERM;
		}
		/* Only in special cases allow devices from mounts
		 * created outside the initial user namespace.
		 *
		 * MNT_LOCK_NODEV marks the implicit nodev as locked so a
		 * later remount cannot simply clear it (see do_remount,
		 * elsewhere in this file).
		 */
		if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
			flags |= MS_NODEV;
			mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
		}
	}

	mnt = vfs_kern_mount(type, flags, name, data);
	if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
	    !mnt->mnt_sb->s_subtype)
		mnt = fs_set_subtype(mnt, fstype);

	put_filesystem(type);
	if (IS_ERR(mnt))
		return PTR_ERR(mnt);

	/* On failure we own the mount reference and must drop it. */
	err = do_add_mount(real_mount(mnt), path, mnt_flags);
	if (err)
		mntput(mnt);
	return err;
}
2334 2340
/*
 * Attach a freshly triggered automount @m at @path, or tear it down on
 * failure.  Consumes both references held on @m either way.
 */
int finish_automount(struct vfsmount *m, struct path *path)
{
	struct mount *mnt = real_mount(m);
	int err;
	/* The new mount record should have at least 2 refs to prevent it being
	 * expired before we get a chance to add it
	 */
	BUG_ON(mnt_get_count(mnt) < 2);

	/* Mounting a filesystem on its own root would loop forever. */
	if (m->mnt_sb == path->mnt->mnt_sb &&
	    m->mnt_root == path->dentry) {
		err = -ELOOP;
		goto fail;
	}

	err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
	if (!err)
		return 0;
fail:
	/* remove m from any expiration list it may be on */
	if (!list_empty(&mnt->mnt_expire)) {
		namespace_lock();
		list_del_init(&mnt->mnt_expire);
		namespace_unlock();
	}
	/* Drop both of the references asserted by the BUG_ON above. */
	mntput(m);
	mntput(m);
	return err;
}
2364 2370
/**
 * mnt_set_expiry - Put a mount on an expiration list
 * @mnt: The mount to list.
 * @expiry_list: The list to add the mount to.
 */
void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
{
	/* namespace_sem protects the expiry list linkage. */
	namespace_lock();

	list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);

	namespace_unlock();
}
EXPORT_SYMBOL(mnt_set_expiry);
2379 2385
/*
 * process a list of expirable mountpoints with the intent of discarding any
 * mountpoints that aren't in use and haven't been touched since last we came
 * here
 */
void mark_mounts_for_expiry(struct list_head *mounts)
{
	struct mount *mnt, *next;
	LIST_HEAD(graveyard);

	if (list_empty(mounts))
		return;

	namespace_lock();
	lock_mount_hash();

	/* extract from the expiration list every vfsmount that matches the
	 * following criteria:
	 * - only referenced by its parent vfsmount
	 * - still marked for expiry (marked on the last call here; marks are
	 *   cleared by mntput())
	 */
	list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
		if (!xchg(&mnt->mnt_expiry_mark, 1) ||
			propagate_mount_busy(mnt, 1))
			continue;
		list_move(&mnt->mnt_expire, &graveyard);
	}
	/* Unmount everything that ended up in the graveyard. */
	while (!list_empty(&graveyard)) {
		mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
		touch_mnt_namespace(mnt->mnt_ns);
		umount_tree(mnt, 1);
	}
	unlock_mount_hash();
	namespace_unlock();
}

EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
2418 2424
/*
 * Ripoff of 'select_parent()'
 *
 * search the list of submounts for a given mountpoint, and move any
 * shrinkable submounts to the 'graveyard' list.
 *
 * Returns the number of mounts moved.  Uses an explicit goto-based
 * depth-first walk instead of recursion.
 */
static int select_submounts(struct mount *parent, struct list_head *graveyard)
{
	struct mount *this_parent = parent;
	struct list_head *next;
	int found = 0;

repeat:
	next = this_parent->mnt_mounts.next;
resume:
	while (next != &this_parent->mnt_mounts) {
		struct list_head *tmp = next;
		struct mount *mnt = list_entry(tmp, struct mount, mnt_child);

		next = tmp->next;
		if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
			continue;
		/*
		 * Descend a level if the d_mounts list is non-empty.
		 */
		if (!list_empty(&mnt->mnt_mounts)) {
			this_parent = mnt;
			goto repeat;
		}

		if (!propagate_mount_busy(mnt, 1)) {
			list_move_tail(&mnt->mnt_expire, graveyard);
			found++;
		}
	}
	/*
	 * All done at this level ... ascend and resume the search
	 */
	if (this_parent != parent) {
		next = this_parent->mnt_child.next;
		this_parent = this_parent->mnt_parent;
		goto resume;
	}
	return found;
}
2464 2470
/*
 * process a list of expirable mountpoints with the intent of discarding any
 * submounts of a specific parent mountpoint
 *
 * mount_lock must be held for write
 */
static void shrink_submounts(struct mount *mnt)
{
	LIST_HEAD(graveyard);
	struct mount *m;

	/* extract submounts of 'mountpoint' from the expiration list */
	while (select_submounts(mnt, &graveyard)) {
		/* umount_tree() may expose new shrinkable submounts,
		 * hence the outer retry loop. */
		while (!list_empty(&graveyard)) {
			m = list_first_entry(&graveyard, struct mount,
						mnt_expire);
			touch_mnt_namespace(m->mnt_ns);
			umount_tree(m, 1);
		}
	}
}
2486 2492
2487 /* 2493 /*
2488 * Some copy_from_user() implementations do not return the exact number of 2494 * Some copy_from_user() implementations do not return the exact number of
2489 * bytes remaining to copy on a fault. But copy_mount_options() requires that. 2495 * bytes remaining to copy on a fault. But copy_mount_options() requires that.
2490 * Note that this function differs from copy_from_user() in that it will oops 2496 * Note that this function differs from copy_from_user() in that it will oops
2491 * on bad values of `to', rather than returning a short copy. 2497 * on bad values of `to', rather than returning a short copy.
2492 */ 2498 */
2493 static long exact_copy_from_user(void *to, const void __user * from, 2499 static long exact_copy_from_user(void *to, const void __user * from,
2494 unsigned long n) 2500 unsigned long n)
2495 { 2501 {
2496 char *t = to; 2502 char *t = to;
2497 const char __user *f = from; 2503 const char __user *f = from;
2498 char c; 2504 char c;
2499 2505
2500 if (!access_ok(VERIFY_READ, from, n)) 2506 if (!access_ok(VERIFY_READ, from, n))
2501 return n; 2507 return n;
2502 2508
2503 while (n) { 2509 while (n) {
2504 if (__get_user(c, f)) { 2510 if (__get_user(c, f)) {
2505 memset(t, 0, n); 2511 memset(t, 0, n);
2506 break; 2512 break;
2507 } 2513 }
2508 *t++ = c; 2514 *t++ = c;
2509 f++; 2515 f++;
2510 n--; 2516 n--;
2511 } 2517 }
2512 return n; 2518 return n;
2513 } 2519 }
2514 2520
/*
 * Copy the userspace mount-options blob at @data into a freshly allocated
 * page, stored in *@where.  Returns 0 on success (including data == NULL,
 * in which case *where stays 0), -ENOMEM or -EFAULT on failure.  The caller
 * frees the page.
 */
int copy_mount_options(const void __user * data, unsigned long *where)
{
	int i;
	unsigned long page;
	unsigned long size;

	*where = 0;
	if (!data)
		return 0;

	if (!(page = __get_free_page(GFP_KERNEL)))
		return -ENOMEM;

	/* We only care that *some* data at the address the user
	 * gave us is valid.  Just in case, we'll zero
	 * the remainder of the page.
	 */
	/* copy_from_user cannot cross TASK_SIZE ! */
	size = TASK_SIZE - (unsigned long)data;
	if (size > PAGE_SIZE)
		size = PAGE_SIZE;

	/* i is the number of bytes actually copied before any fault. */
	i = size - exact_copy_from_user((void *)page, data, size);
	if (!i) {
		/* Nothing copyable at all -> bad user pointer. */
		free_page(page);
		return -EFAULT;
	}
	if (i != PAGE_SIZE)
		memset((char *)page + i, 0, PAGE_SIZE - i);
	*where = page;
	return 0;
}
2547 2553
2548 char *copy_mount_string(const void __user *data) 2554 char *copy_mount_string(const void __user *data)
2549 { 2555 {
2550 return data ? strndup_user(data, PAGE_SIZE) : NULL; 2556 return data ? strndup_user(data, PAGE_SIZE) : NULL;
2551 } 2557 }
2552 2558
/*
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 */
long do_mount(const char *dev_name, const char __user *dir_name,
		const char *type_page, unsigned long flags, void *data_page)
{
	struct path path;
	int retval = 0;
	int mnt_flags = 0;

	/* Discard magic */
	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
		flags &= ~MS_MGC_MSK;

	/* Basic sanity checks */
	if (data_page)
		((char *)data_page)[PAGE_SIZE - 1] = 0;

	/* ... and get the mountpoint */
	retval = user_path(dir_name, &path);
	if (retval)
		return retval;

	retval = security_sb_mount(dev_name, &path,
				   type_page, flags, data_page);
	if (!retval && !may_mount())
		retval = -EPERM;
	if (retval)
		goto dput_out;

	/* Default to relatime unless overriden */
	if (!(flags & MS_NOATIME))
		mnt_flags |= MNT_RELATIME;

	/* Separate the per-mountpoint flags */
	if (flags & MS_NOSUID)
		mnt_flags |= MNT_NOSUID;
	if (flags & MS_NODEV)
		mnt_flags |= MNT_NODEV;
	if (flags & MS_NOEXEC)
		mnt_flags |= MNT_NOEXEC;
	if (flags & MS_NOATIME)
		mnt_flags |= MNT_NOATIME;
	if (flags & MS_NODIRATIME)
		mnt_flags |= MNT_NODIRATIME;
	if (flags & MS_STRICTATIME)
		mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
	if (flags & MS_RDONLY)
		mnt_flags |= MNT_READONLY;

	/* The default atime for remount is preservation */
	if ((flags & MS_REMOUNT) &&
	    ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
		       MS_STRICTATIME)) == 0)) {
		mnt_flags &= ~MNT_ATIME_MASK;
		mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK;
	}

	/* Strip the MS_* flags that were translated to MNT_* flags above
	 * (or are kernel-internal) before handing off. */
	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
		   MS_STRICTATIME);

	/* Dispatch on the operation kind encoded in the remaining flags. */
	if (flags & MS_REMOUNT)
		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
				    data_page);
	else if (flags & MS_BIND)
		retval = do_loopback(&path, dev_name, flags & MS_REC);
	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
		retval = do_change_type(&path, flags);
	else if (flags & MS_MOVE)
		retval = do_move_mount(&path, dev_name);
	else
		retval = do_new_mount(&path, type_page, flags, mnt_flags,
				      dev_name, data_page);
dput_out:
	path_put(&path);
	return retval;
}
2642 2648
/* Release a mount namespace's resources; kfree() must come last. */
static void free_mnt_ns(struct mnt_namespace *ns)
{
	proc_free_inum(ns->proc_inum);
	put_user_ns(ns->user_ns);
	kfree(ns);
}
2649 2655
/*
 * Assign a sequence number so we can detect when we attempt to bind
 * mount a reference to an older mount namespace into the current
 * mount namespace, preventing reference counting loops.  A 64bit
 * number incrementing at 10Ghz will take 12,427 years to wrap which
 * is effectively never, so we can ignore the possibility.
 */
static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);

/*
 * Allocate and initialize an empty mount namespace owned by @user_ns.
 * Returns the namespace or an ERR_PTR on allocation/inum failure.
 */
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
{
	struct mnt_namespace *new_ns;
	int ret;

	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
	if (!new_ns)
		return ERR_PTR(-ENOMEM);
	ret = proc_alloc_inum(&new_ns->proc_inum);
	if (ret) {
		kfree(new_ns);
		return ERR_PTR(ret);
	}
	new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
	atomic_set(&new_ns->count, 1);
	new_ns->root = NULL;
	INIT_LIST_HEAD(&new_ns->list);
	init_waitqueue_head(&new_ns->poll);
	new_ns->event = 0;
	new_ns->user_ns = get_user_ns(user_ns);
	return new_ns;
}
2681 2687
/*
 * Duplicate the caller's mount namespace for CLONE_NEWNS.  Without
 * CLONE_NEWNS just takes another reference on @ns.  When @new_fs is
 * given, its root/pwd vfsmounts are switched over to the copies.
 */
struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
		struct user_namespace *user_ns, struct fs_struct *new_fs)
{
	struct mnt_namespace *new_ns;
	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
	struct mount *p, *q;
	struct mount *old;
	struct mount *new;
	int copy_flags;

	BUG_ON(!ns);

	if (likely(!(flags & CLONE_NEWNS))) {
		get_mnt_ns(ns);
		return ns;
	}

	old = ns->root;

	new_ns = alloc_mnt_ns(user_ns);
	if (IS_ERR(new_ns))
		return new_ns;

	namespace_lock();
	/* First pass: copy the tree topology */
	copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
	if (user_ns != ns->user_ns)
		copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
	if (IS_ERR(new)) {
		namespace_unlock();
		free_mnt_ns(new_ns);
		return ERR_CAST(new);
	}
	new_ns->root = new;
	list_add_tail(&new_ns->list, &new->mnt_list);

	/*
	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
	 * as belonging to new namespace.  We have already acquired a private
	 * fs_struct, so tsk->fs->lock is not needed.
	 */
	p = old;
	q = new;
	while (p) {
		q->mnt_ns = new_ns;
		if (new_fs) {
			if (&p->mnt == new_fs->root.mnt) {
				new_fs->root.mnt = mntget(&q->mnt);
				rootmnt = &p->mnt;
			}
			if (&p->mnt == new_fs->pwd.mnt) {
				new_fs->pwd.mnt = mntget(&q->mnt);
				pwdmnt = &p->mnt;
			}
		}
		p = next_mnt(p, old);
		q = next_mnt(q, new);
		if (!q)
			break;
		/* The copy may have skipped some mounts (e.g. unbindable
		 * ones); keep p aligned with q by matching roots. */
		while (p->mnt.mnt_root != q->mnt.mnt_root)
			p = next_mnt(p, old);
	}
	namespace_unlock();

	/* Drop the references displaced from new_fs, outside the lock. */
	if (rootmnt)
		mntput(rootmnt);
	if (pwdmnt)
		mntput(pwdmnt);

	return new_ns;
}
2754 2760
/**
 * create_mnt_ns - creates a private namespace and adds a root filesystem
 * @mnt: pointer to the new root filesystem mountpoint
 *
 * Consumes the reference on @m on failure.
 */
static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
{
	struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns);
	if (!IS_ERR(new_ns)) {
		struct mount *mnt = real_mount(m);
		mnt->mnt_ns = new_ns;
		new_ns->root = mnt;
		list_add(&mnt->mnt_list, &new_ns->list);
	} else {
		mntput(m);
	}
	return new_ns;
}
2772 2778
/*
 * Mount @mnt in a throw-away private namespace, look up @name relative to
 * its root, and return the resulting dentry with its superblock pinned
 * (s_active taken, s_umount held for write).  Consumes the reference on
 * @mnt.  Returns an ERR_PTR on failure.
 */
struct dentry *mount_subtree(struct vfsmount *mnt, const char *name)
{
	struct mnt_namespace *ns;
	struct super_block *s;
	struct path path;
	int err;

	ns = create_mnt_ns(mnt);
	if (IS_ERR(ns))
		return ERR_CAST(ns);

	err = vfs_path_lookup(mnt->mnt_root, mnt,
			name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);

	put_mnt_ns(ns);

	if (err)
		return ERR_PTR(err);

	/* trade a vfsmount reference for active sb one */
	s = path.mnt->mnt_sb;
	atomic_inc(&s->s_active);
	mntput(path.mnt);
	/* lock the sucker */
	down_write(&s->s_umount);
	/* ... and return the root of (sub)tree on it */
	return path.dentry;
}
EXPORT_SYMBOL(mount_subtree);
2802 2808
/*
 * mount(2) entry point: copy the string and options arguments in from
 * userspace, then hand off to do_mount().  dir_name stays a user pointer;
 * do_mount() resolves it itself.
 */
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
		char __user *, type, unsigned long, flags, void __user *, data)
{
	int ret;
	char *kernel_type;
	char *kernel_dev;
	unsigned long data_page;

	kernel_type = copy_mount_string(type);
	ret = PTR_ERR(kernel_type);
	if (IS_ERR(kernel_type))
		goto out_type;

	kernel_dev = copy_mount_string(dev_name);
	ret = PTR_ERR(kernel_dev);
	if (IS_ERR(kernel_dev))
		goto out_dev;

	ret = copy_mount_options(data, &data_page);
	if (ret < 0)
		goto out_data;

	ret = do_mount(kernel_dev, dir_name, kernel_type, flags,
		(void *) data_page);

	free_page(data_page);
out_data:
	kfree(kernel_dev);
out_dev:
	kfree(kernel_type);
out_type:
	return ret;
}
2836 2842
/*
 * Return true if path is reachable from root
 *
 * namespace_sem or mount_lock is held
 */
bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
			 const struct path *root)
{
	/* Climb the mount tree: each step replaces (mnt, dentry) with the
	 * parent mount and the dentry this mount is attached on, stopping
	 * once we reach root's vfsmount or a mount with no parent. */
	while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
		dentry = mnt->mnt_mountpoint;
		mnt = mnt->mnt_parent;
	}
	/* Reachable only if we ended up on root's mount AND the final
	 * dentry sits at or below root's dentry. */
	return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
}
2851 2857
2852 int path_is_under(struct path *path1, struct path *path2) 2858 int path_is_under(struct path *path1, struct path *path2)
2853 { 2859 {
2854 int res; 2860 int res;
2855 read_seqlock_excl(&mount_lock); 2861 read_seqlock_excl(&mount_lock);
2856 res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); 2862 res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
2857 read_sequnlock_excl(&mount_lock); 2863 read_sequnlock_excl(&mount_lock);
2858 return res; 2864 return res;
2859 } 2865 }
2860 EXPORT_SYMBOL(path_is_under); 2866 EXPORT_SYMBOL(path_is_under);
2861 2867
/*
 * pivot_root Semantics:
 * Moves the root file system of the current process to the directory put_old,
 * makes new_root as the new root file system of the current process, and sets
 * root/cwd of all processes which had them on the current root to new_root.
 *
 * Restrictions:
 * The new_root and put_old must be directories, and  must not be on the
 * same file  system as the current process root. The put_old  must  be
 * underneath new_root,  i.e. adding a non-zero number of /.. to the string
 * pointed to by put_old must yield the same directory as new_root. No other
 * file system may be mounted on put_old. After all, new_root is a mountpoint.
 *
 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
 * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
 * in this situation.
 *
 * Notes:
 *  - we don't move root/cwd if they are not at the root (reason: if something
 *    cared enough to change them, it's probably wrong to force them elsewhere)
 *  - it's okay to pick a root that isn't the root of a file system, e.g.
 *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
 *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
 *    first.
 */
SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
		const char __user *, put_old)
{
	struct path new, old, parent_path, root_parent, root;
	struct mount *new_mnt, *root_mnt, *old_mnt;
	struct mountpoint *old_mp, *root_mp;
	int error;

	if (!may_mount())
		return -EPERM;

	error = user_path_dir(new_root, &new);
	if (error)
		goto out0;

	error = user_path_dir(put_old, &old);
	if (error)
		goto out1;

	error = security_sb_pivotroot(&old, &new);
	if (error)
		goto out2;

	get_fs_root(current->fs, &root);
	/* lock_mount() takes namespace_sem and pins the mountpoint of old */
	old_mp = lock_mount(&old);
	error = PTR_ERR(old_mp);
	if (IS_ERR(old_mp))
		goto out3;

	error = -EINVAL;
	new_mnt = real_mount(new.mnt);
	root_mnt = real_mount(root.mnt);
	old_mnt = real_mount(old.mnt);
	/* Refuse if any of the mounts involved is part of a shared
	 * propagation group - pivoting would have to propagate. */
	if (IS_MNT_SHARED(old_mnt) ||
		IS_MNT_SHARED(new_mnt->mnt_parent) ||
		IS_MNT_SHARED(root_mnt->mnt_parent))
		goto out4;
	if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
		goto out4;
	if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
		goto out4;
	error = -ENOENT;
	if (d_unlinked(new.dentry))
		goto out4;
	error = -EBUSY;
	if (new_mnt == root_mnt || old_mnt == root_mnt)
		goto out4; /* loop, on the same file system */
	error = -EINVAL;
	if (root.mnt->mnt_root != root.dentry)
		goto out4; /* not a mountpoint */
	if (!mnt_has_parent(root_mnt))
		goto out4; /* not attached */
	root_mp = root_mnt->mnt_mp;
	if (new.mnt->mnt_root != new.dentry)
		goto out4; /* not a mountpoint */
	if (!mnt_has_parent(new_mnt))
		goto out4; /* not attached */
	/* make sure we can reach put_old from new_root */
	if (!is_path_reachable(old_mnt, old.dentry, &new))
		goto out4;
	/* make certain new is below the root */
	if (!is_path_reachable(new_mnt, new.dentry, &root))
		goto out4;
	root_mp->m_count++; /* pin it so it won't go away */
	lock_mount_hash();
	detach_mnt(new_mnt, &parent_path);
	detach_mnt(root_mnt, &root_parent);
	/* If the old root was locked in place, transfer the lock to the
	 * new root so an unprivileged namespace can't expose what was
	 * underneath. */
	if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
		new_mnt->mnt.mnt_flags |= MNT_LOCKED;
		root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
	}
	/* mount old root on put_old */
	attach_mnt(root_mnt, old_mnt, old_mp);
	/* mount new_root on / */
	attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp);
	touch_mnt_namespace(current->nsproxy->mnt_ns);
	unlock_mount_hash();
	/* Point root/cwd of every task that used the old root at new */
	chroot_fs_refs(&root, &new);
	put_mountpoint(root_mp);
	error = 0;
out4:
	unlock_mount(old_mp);
	if (!error) {
		path_put(&root_parent);
		path_put(&parent_path);
	}
out3:
	path_put(&root);
out2:
	path_put(&old);
out1:
	path_put(&new);
out0:
	return error;
}
2982 2988
/*
 * Mount rootfs and install it as the initial mount namespace, then make
 * it the root and cwd of the boot task.  Any failure here is fatal.
 */
static void __init init_mount_tree(void)
{
	struct vfsmount *mnt;
	struct mnt_namespace *ns;
	struct path root;
	struct file_system_type *type;

	type = get_fs_type("rootfs");
	if (!type)
		panic("Can't find rootfs type");
	mnt = vfs_kern_mount(type, 0, "rootfs", NULL);
	/* vfs_kern_mount() holds its own reference on the type */
	put_filesystem(type);
	if (IS_ERR(mnt))
		panic("Can't create rootfs");

	ns = create_mnt_ns(mnt);
	if (IS_ERR(ns))
		panic("Can't allocate initial namespace");

	init_task.nsproxy->mnt_ns = ns;
	get_mnt_ns(ns);

	root.mnt = mnt;
	root.dentry = mnt->mnt_root;

	set_fs_pwd(current->fs, &root);
	set_fs_root(current->fs, &root);
}
3011 3017
/*
 * Boot-time initialisation of the mount subsystem: slab cache for
 * struct mount, the mount/mountpoint hash tables, sysfs, and finally
 * the initial rootfs mount tree.
 */
void __init mnt_init(void)
{
	unsigned u;
	int err;

	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);

	/* Hash tables are sized from boot memory (mhash_entries /
	 * mphash_entries command-line overrides; 2^19 default scale). */
	mount_hashtable = alloc_large_system_hash("Mount-cache",
				sizeof(struct hlist_head),
				mhash_entries, 19,
				0,
				&m_hash_shift, &m_hash_mask, 0, 0);
	mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
				sizeof(struct hlist_head),
				mphash_entries, 19,
				0,
				&mp_hash_shift, &mp_hash_mask, 0, 0);

	if (!mount_hashtable || !mountpoint_hashtable)
		panic("Failed to allocate mount hash table\n");

	for (u = 0; u <= m_hash_mask; u++)
		INIT_HLIST_HEAD(&mount_hashtable[u]);
	for (u = 0; u <= mp_hash_mask; u++)
		INIT_HLIST_HEAD(&mountpoint_hashtable[u]);

	kernfs_init();

	/* sysfs failing is survivable; warn and carry on */
	err = sysfs_init();
	if (err)
		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
			__func__, err);
	fs_kobj = kobject_create_and_add("fs", NULL);
	if (!fs_kobj)
		printk(KERN_WARNING "%s: kobj create error\n", __func__);
	init_rootfs();
	init_mount_tree();
}
3051 3057
3052 void put_mnt_ns(struct mnt_namespace *ns) 3058 void put_mnt_ns(struct mnt_namespace *ns)
3053 { 3059 {
3054 if (!atomic_dec_and_test(&ns->count)) 3060 if (!atomic_dec_and_test(&ns->count))
3055 return; 3061 return;
3056 drop_collected_mounts(&ns->root->mnt); 3062 drop_collected_mounts(&ns->root->mnt);
3057 free_mnt_ns(ns); 3063 free_mnt_ns(ns);
3058 } 3064 }
3059 3065
3060 struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) 3066 struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
3061 { 3067 {
3062 struct vfsmount *mnt; 3068 struct vfsmount *mnt;
3063 mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data); 3069 mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data);
3064 if (!IS_ERR(mnt)) { 3070 if (!IS_ERR(mnt)) {
3065 /* 3071 /*
3066 * it is a longterm mount, don't release mnt until 3072 * it is a longterm mount, don't release mnt until
3067 * we unmount before file sys is unregistered 3073 * we unmount before file sys is unregistered
3068 */ 3074 */
3069 real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; 3075 real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
3070 } 3076 }
3071 return mnt; 3077 return mnt;
3072 } 3078 }
3073 EXPORT_SYMBOL_GPL(kern_mount_data); 3079 EXPORT_SYMBOL_GPL(kern_mount_data);
3074 3080
/*
 * Undo kern_mount_data(): clear the long-term marker and drop the
 * reference.  Safe to call with NULL or an ERR_PTR.
 */
void kern_unmount(struct vfsmount *mnt)
{
	/* release long term mount so mount point can be released */
	if (!IS_ERR_OR_NULL(mnt)) {
		real_mount(mnt)->mnt_ns = NULL;
		/* wait for RCU walkers that may still see mnt_ns set */
		synchronize_rcu();	/* yecchhh... */
		mntput(mnt);
	}
}
EXPORT_SYMBOL(kern_unmount);
3085 3091
/* Does this vfsmount belong to the caller's mount namespace? */
bool our_mnt(struct vfsmount *mnt)
{
	return check_mnt(real_mount(mnt));
}
3090 3096
bool current_chrooted(void)
{
	/* Does the current process have a non-standard root */
	struct path ns_root;
	struct path fs_root;
	bool chrooted;

	/* Find the namespace root, following anything mounted on top of
	 * it so we compare against the effective root. */
	ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
	ns_root.dentry = ns_root.mnt->mnt_root;
	path_get(&ns_root);
	while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
		;

	get_fs_root(current->fs, &fs_root);

	/* chrooted iff the task's root differs from the namespace root */
	chrooted = !path_equal(&fs_root, &ns_root);

	path_put(&fs_root);
	path_put(&ns_root);

	return chrooted;
}
3114 3120
/*
 * Is every instance of this filesystem type in the caller's mount
 * namespace fully visible, i.e. is there at least one mount of it whose
 * children hide nothing but empty directories?
 */
bool fs_fully_visible(struct file_system_type *type)
{
	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
	struct mount *mnt;
	bool visible = false;

	if (unlikely(!ns))
		return false;

	down_read(&namespace_sem);
	list_for_each_entry(mnt, &ns->list, mnt_list) {
		struct mount *child;
		if (mnt->mnt.mnt_sb->s_type != type)
			continue;

		/* This mount is not fully visible if there are any child mounts
		 * that cover anything except for empty directories.
		 */
		list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
			struct inode *inode = child->mnt_mountpoint->d_inode;
			if (!S_ISDIR(inode->i_mode))
				goto next;
			/* i_nlink > 2 means the directory has entries,
			 * so the child mount is covering something. */
			if (inode->i_nlink > 2)
				goto next;
		}
		/* found a candidate whose children hide nothing */
		visible = true;
		goto found;
	next:	;
	}
found:
	up_read(&namespace_sem);
	return visible;
}
3148 3154
3149 static void *mntns_get(struct task_struct *task) 3155 static void *mntns_get(struct task_struct *task)
3150 { 3156 {
3151 struct mnt_namespace *ns = NULL; 3157 struct mnt_namespace *ns = NULL;
3152 struct nsproxy *nsproxy; 3158 struct nsproxy *nsproxy;
3153 3159
3154 task_lock(task); 3160 task_lock(task);
3155 nsproxy = task->nsproxy; 3161 nsproxy = task->nsproxy;
3156 if (nsproxy) { 3162 if (nsproxy) {
3157 ns = nsproxy->mnt_ns; 3163 ns = nsproxy->mnt_ns;
3158 get_mnt_ns(ns); 3164 get_mnt_ns(ns);
3159 } 3165 }
3160 task_unlock(task); 3166 task_unlock(task);
3161 3167
3162 return ns; 3168 return ns;
3163 } 3169 }
3164 3170
/* proc_ns_operations.put: drop the reference taken by mntns_get() */
static void mntns_put(void *ns)
{
	put_mnt_ns(ns);
}
3169 3175
/*
 * proc_ns_operations.install: switch the caller (via setns) into the
 * given mount namespace and reset its root/cwd to that namespace's
 * effective root.
 */
static int mntns_install(struct nsproxy *nsproxy, void *ns)
{
	struct fs_struct *fs = current->fs;
	struct mnt_namespace *mnt_ns = ns;
	struct path root;

	/* Need admin rights over the target namespace, plus chroot and
	 * admin capability in the caller's own user namespace. */
	if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
	    !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
		return -EPERM;

	/* Refuse if fs_struct is shared (e.g. CLONE_FS threads) -
	 * changing root out from under siblings would be unsafe. */
	if (fs->users != 1)
		return -EINVAL;

	get_mnt_ns(mnt_ns);
	put_mnt_ns(nsproxy->mnt_ns);
	nsproxy->mnt_ns = mnt_ns;

	/* Find the root, following mounts stacked on top of it */
	root.mnt    = &mnt_ns->root->mnt;
	root.dentry = mnt_ns->root->mnt.mnt_root;
	path_get(&root);
	while(d_mountpoint(root.dentry) && follow_down_one(&root))
		;

	/* Update the pwd and root */
	set_fs_pwd(fs, &root);
	set_fs_root(fs, &root);

	path_put(&root);
	return 0;
}
3202 3208
/* proc_ns_operations.inum: proc inode number identifying this namespace */
static unsigned int mntns_inum(void *ns)
{
	struct mnt_namespace *mnt_ns = ns;
	return mnt_ns->proc_inum;
}
3208 3214
/* Hooks backing /proc/<pid>/ns/mnt and setns(CLONE_NEWNS) */
const struct proc_ns_operations mntns_operations = {
	.name		= "mnt",
	.type		= CLONE_NEWNS,
	.get		= mntns_get,
	.put		= mntns_put,
	.install	= mntns_install,
	.inum		= mntns_inum,
};
3217 3223