Commit e149ed2b805fefdccf7ccdfc19eca22fdd4514ac

Authored by Al Viro
Parent: f77c80142e

take the targets of /proc/*/ns/* symlinks to separate fs

New pseudo-filesystem: nsfs.  Targets of /proc/*/ns/* live there now.
It's not mountable (not even registered, so it's not in /proc/filesystems,
etc.).  Files on it *are* bindable - we explicitly permit that in do_loopback().
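
For instance, a minimal userspace sketch of that bindability (not part of this
commit; /tmp/netns is an arbitrary, pre-created empty file), keeping a network
namespace alive past the death of its last task:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        /* Bind the namespace file somewhere persistent; do_loopback()
         * explicitly permits this even though nsfs itself can't be
         * mounted. */
        if (mount("/proc/self/ns/net", "/tmp/netns", NULL, MS_BIND, NULL)) {
                perror("mount");
                return 1;
        }
        return 0;
}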

This stuff lives in fs/nsfs.c now; proc_ns_fget() moved there as well.
get_proc_ns() is a macro now (it simply returns ->i_private; it would
have been an inline, if not for a header-ordering headache).
proc_ns_inode() is an ex-parrot (gone).  The interface used in procfs is
ns_get_path(path, task, ops) and ns_get_name(buf, size, task, ops).
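
In prototype form, that interface looks roughly like this (a sketch built from
the argument lists above; the return types are assumptions, not spelled out in
this message):

void *ns_get_path(struct path *path, struct task_struct *task,
                  const struct proc_ns_operations *ns_ops);
int ns_get_name(char *buf, size_t size, struct task_struct *task,
                const struct proc_ns_operations *ns_ops);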

Dentries and inodes are never hashed; a non-counting reference to dentry
is stashed in ns_common (removed by ->d_prune()) and reused by ns_get_path()
if present.  See ns_get_path()/ns_prune_dentry/nsfs_evict() for details
of that mechanism.
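
Schematically, the stash-and-reuse works like this (a simplified sketch; the
field name ->stashed and the helper ns_reuse_stashed() are assumed names, and
the real code must also handle the race between stashing and pruning):

/* ->d_prune(): forget the non-counting reference before the dentry dies */
static void ns_prune_dentry(struct dentry *dentry)
{
        struct inode *inode = dentry->d_inode;
        if (inode) {
                struct ns_common *ns = inode->i_private;
                ns->stashed = NULL;
        }
}

/* ns_get_path() fast path (sketch): reuse the stashed dentry if the
 * non-counting reference can still be turned into a real one */
static bool ns_reuse_stashed(struct ns_common *ns, struct vfsmount *mnt,
                             struct path *path)
{
        struct dentry *dentry = ns->stashed;
        if (!dentry || !lockref_get_not_dead(&dentry->d_lockref))
                return false;   /* pruned or dying; allocate a fresh one */
        path->mnt = mntget(mnt);
        path->dentry = dentry;
        return true;
}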

As a result, proc_ns_follow_link() has stopped poking in nd->path.mnt;
it does nd_jump_link() on a consistent <vfsmount,dentry> pair it gets
from ns_get_path().
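
Schematically, the symlink resolution now reduces to something like this
(a sketch of the era's calling conventions; get_proc_task() and the
PROC_I(inode)->ns_ops lookup are assumed from the surrounding procfs code):

static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
{
        struct inode *inode = dentry->d_inode;
        struct task_struct *task;
        struct path ns_path;
        void *error = ERR_PTR(-EACCES);

        task = get_proc_task(inode);
        if (!task)
                return error;

        if (ptrace_may_access(task, PTRACE_MODE_READ)) {
                error = ns_get_path(&ns_path, task, PROC_I(inode)->ns_ops);
                if (!error)
                        nd_jump_link(nd, &ns_path); /* consistent pair */
        }
        put_task_struct(task);
        return error;
}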

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

Showing 10 changed files with 208 additions and 161 deletions

--- a/fs/Makefile
+++ b/fs/Makefile
 #
 # Makefile for the Linux filesystems.
 #
 # 14 Sep 2000, Christoph Hellwig <hch@infradead.org>
 # Rewritten to use lists instead of if-statements.
 #

 obj-y := open.o read_write.o file_table.o super.o \
         char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
         ioctl.o readdir.o select.o dcache.o inode.o \
         attr.o bad_inode.o file.o filesystems.o namespace.o \
         seq_file.o xattr.o libfs.o fs-writeback.o \
         pnode.o splice.o sync.o utimes.o \
-        stack.o fs_struct.o statfs.o fs_pin.o
+        stack.o fs_struct.o statfs.o fs_pin.o nsfs.o

 ifeq ($(CONFIG_BLOCK),y)
 obj-y += buffer.o block_dev.o direct-io.o mpage.o
 else
 obj-y += no-block.o
 endif

 obj-$(CONFIG_PROC_FS) += proc_namespace.o

 obj-y += notify/
 obj-$(CONFIG_EPOLL) += eventpoll.o
 obj-$(CONFIG_ANON_INODES) += anon_inodes.o
 obj-$(CONFIG_SIGNALFD) += signalfd.o
 obj-$(CONFIG_TIMERFD) += timerfd.o
 obj-$(CONFIG_EVENTFD) += eventfd.o
 obj-$(CONFIG_AIO) += aio.o
 obj-$(CONFIG_FILE_LOCKING) += locks.o
 obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
 obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
 obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o
 obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o
 obj-$(CONFIG_BINFMT_SCRIPT) += binfmt_script.o
 obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o
 obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o
 obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o
 obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o
 obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o

 obj-$(CONFIG_FS_MBCACHE) += mbcache.o
 obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o
 obj-$(CONFIG_NFS_COMMON) += nfs_common/
 obj-$(CONFIG_COREDUMP) += coredump.o
 obj-$(CONFIG_SYSCTL) += drop_caches.o

 obj-$(CONFIG_FHANDLE) += fhandle.o

 obj-y += quota/

 obj-$(CONFIG_PROC_FS) += proc/
 obj-$(CONFIG_KERNFS) += kernfs/
 obj-$(CONFIG_SYSFS) += sysfs/
 obj-$(CONFIG_CONFIGFS_FS) += configfs/
 obj-y += devpts/

 obj-$(CONFIG_PROFILING) += dcookies.o
 obj-$(CONFIG_DLM) += dlm/

 # Do not add any filesystems before this line
 obj-$(CONFIG_FSCACHE) += fscache/
 obj-$(CONFIG_REISERFS_FS) += reiserfs/
 obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
 obj-$(CONFIG_EXT2_FS) += ext2/
 # We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2
 # unless explicitly requested by rootfstype
 obj-$(CONFIG_EXT4_FS) += ext4/
 obj-$(CONFIG_JBD) += jbd/
 obj-$(CONFIG_JBD2) += jbd2/
 obj-$(CONFIG_CRAMFS) += cramfs/
 obj-$(CONFIG_SQUASHFS) += squashfs/
 obj-y += ramfs/
 obj-$(CONFIG_HUGETLBFS) += hugetlbfs/
 obj-$(CONFIG_CODA_FS) += coda/
 obj-$(CONFIG_MINIX_FS) += minix/
 obj-$(CONFIG_FAT_FS) += fat/
 obj-$(CONFIG_BFS_FS) += bfs/
 obj-$(CONFIG_ISO9660_FS) += isofs/
 obj-$(CONFIG_HFSPLUS_FS) += hfsplus/ # Before hfs to find wrapped HFS+
 obj-$(CONFIG_HFS_FS) += hfs/
 obj-$(CONFIG_ECRYPT_FS) += ecryptfs/
 obj-$(CONFIG_VXFS_FS) += freevxfs/
 obj-$(CONFIG_NFS_FS) += nfs/
 obj-$(CONFIG_EXPORTFS) += exportfs/
 obj-$(CONFIG_NFSD) += nfsd/
 obj-$(CONFIG_LOCKD) += lockd/
 obj-$(CONFIG_NLS) += nls/
 obj-$(CONFIG_SYSV_FS) += sysv/
 obj-$(CONFIG_CIFS) += cifs/
 obj-$(CONFIG_NCP_FS) += ncpfs/
 obj-$(CONFIG_HPFS_FS) += hpfs/
 obj-$(CONFIG_NTFS_FS) += ntfs/
 obj-$(CONFIG_UFS_FS) += ufs/
 obj-$(CONFIG_EFS_FS) += efs/
 obj-$(CONFIG_JFFS2_FS) += jffs2/
 obj-$(CONFIG_LOGFS) += logfs/
 obj-$(CONFIG_UBIFS_FS) += ubifs/
 obj-$(CONFIG_AFFS_FS) += affs/
 obj-$(CONFIG_ROMFS_FS) += romfs/
 obj-$(CONFIG_QNX4FS_FS) += qnx4/
 obj-$(CONFIG_QNX6FS_FS) += qnx6/
 obj-$(CONFIG_AUTOFS4_FS) += autofs4/
 obj-$(CONFIG_ADFS_FS) += adfs/
 obj-$(CONFIG_FUSE_FS) += fuse/
 obj-$(CONFIG_OVERLAYFS_FS) += overlayfs/
 obj-$(CONFIG_UDF_FS) += udf/
 obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/
 obj-$(CONFIG_OMFS_FS) += omfs/
 obj-$(CONFIG_JFS_FS) += jfs/
 obj-$(CONFIG_XFS_FS) += xfs/
 obj-$(CONFIG_9P_FS) += 9p/
 obj-$(CONFIG_AFS_FS) += afs/
 obj-$(CONFIG_NILFS2_FS) += nilfs2/
 obj-$(CONFIG_BEFS_FS) += befs/
 obj-$(CONFIG_HOSTFS) += hostfs/
 obj-$(CONFIG_HPPFS) += hppfs/
 obj-$(CONFIG_CACHEFILES) += cachefiles/
 obj-$(CONFIG_DEBUG_FS) += debugfs/
 obj-$(CONFIG_OCFS2_FS) += ocfs2/
 obj-$(CONFIG_BTRFS_FS) += btrfs/
 obj-$(CONFIG_GFS2_FS) += gfs2/
 obj-$(CONFIG_F2FS_FS) += f2fs/
 obj-y += exofs/ # Multiple modules
 obj-$(CONFIG_CEPH_FS) += ceph/
 obj-$(CONFIG_PSTORE) += pstore/
 obj-$(CONFIG_EFIVAR_FS) += efivarfs/

--- a/fs/internal.h
+++ b/fs/internal.h
 /* fs/ internal definitions
  *
  * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  */

 struct super_block;
 struct file_system_type;
 struct linux_binprm;
 struct path;
 struct mount;

 /*
  * block_dev.c
  */
 #ifdef CONFIG_BLOCK
 extern void __init bdev_cache_init(void);

 extern int __sync_blockdev(struct block_device *bdev, int wait);

 #else
 static inline void bdev_cache_init(void)
 {
 }

 static inline int __sync_blockdev(struct block_device *bdev, int wait)
 {
         return 0;
 }
 #endif

 /*
  * buffer.c
  */
 extern void guard_bio_eod(int rw, struct bio *bio);

 /*
  * char_dev.c
  */
 extern void __init chrdev_init(void);

 /*
  * namei.c
  */
 extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *);
 extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
                 const char *, unsigned int, struct path *);

 /*
  * namespace.c
  */
 extern int copy_mount_options(const void __user *, unsigned long *);
 extern char *copy_mount_string(const void __user *);

 extern struct vfsmount *lookup_mnt(struct path *);
 extern int finish_automount(struct vfsmount *, struct path *);

 extern int sb_prepare_remount_readonly(struct super_block *);

 extern void __init mnt_init(void);

 extern int __mnt_want_write(struct vfsmount *);
 extern int __mnt_want_write_file(struct file *);
 extern void __mnt_drop_write(struct vfsmount *);
 extern void __mnt_drop_write_file(struct file *);

 /*
  * fs_struct.c
  */
 extern void chroot_fs_refs(const struct path *, const struct path *);

 /*
  * file_table.c
  */
 extern struct file *get_empty_filp(void);

 /*
  * super.c
  */
 extern int do_remount_sb(struct super_block *, int, void *, int);
 extern bool grab_super_passive(struct super_block *sb);
 extern struct dentry *mount_fs(struct file_system_type *,
                 int, const char *, void *);
 extern struct super_block *user_get_super(dev_t);

 /*
  * open.c
  */
 struct open_flags {
         int open_flag;
         umode_t mode;
         int acc_mode;
         int intent;
         int lookup_flags;
 };
 extern struct file *do_filp_open(int dfd, struct filename *pathname,
                 const struct open_flags *op);
 extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
                 const char *, const struct open_flags *);

 extern long do_handle_open(int mountdirfd,
                 struct file_handle __user *ufh, int open_flag);
 extern int open_check_o_direct(struct file *f);

 /*
  * inode.c
  */
 extern spinlock_t inode_sb_list_lock;
 extern long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan,
                 int nid);
 extern void inode_add_lru(struct inode *inode);

 /*
  * fs-writeback.c
  */
 extern void inode_wb_list_del(struct inode *inode);

 extern long get_nr_dirty_inodes(void);
 extern void evict_inodes(struct super_block *);
 extern int invalidate_inodes(struct super_block *, bool);

 /*
  * dcache.c
  */
 extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
 extern int d_set_mounted(struct dentry *dentry);
 extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
                 int nid);

 /*
  * read_write.c
  */
 extern int rw_verify_area(int, struct file *, const loff_t *, size_t);

 /*
  * pipe.c
  */
 extern const struct file_operations pipefifo_fops;

 /*
  * fs_pin.c
  */
 extern void sb_pin_kill(struct super_block *sb);
 extern void mnt_pin_kill(struct mount *m);
+
+/*
+ * fs/nsfs.c
+ */
+extern struct dentry_operations ns_dentry_operations;

1 /* 1 /*
2 * linux/fs/namespace.c 2 * linux/fs/namespace.c
3 * 3 *
4 * (C) Copyright Al Viro 2000, 2001 4 * (C) Copyright Al Viro 2000, 2001
5 * Released under GPL v2. 5 * Released under GPL v2.
6 * 6 *
7 * Based on code from fs/super.c, copyright Linus Torvalds and others. 7 * Based on code from fs/super.c, copyright Linus Torvalds and others.
8 * Heavily rewritten. 8 * Heavily rewritten.
9 */ 9 */
10 10
11 #include <linux/syscalls.h> 11 #include <linux/syscalls.h>
12 #include <linux/export.h> 12 #include <linux/export.h>
13 #include <linux/capability.h> 13 #include <linux/capability.h>
14 #include <linux/mnt_namespace.h> 14 #include <linux/mnt_namespace.h>
15 #include <linux/user_namespace.h> 15 #include <linux/user_namespace.h>
16 #include <linux/namei.h> 16 #include <linux/namei.h>
17 #include <linux/security.h> 17 #include <linux/security.h>
18 #include <linux/idr.h> 18 #include <linux/idr.h>
19 #include <linux/init.h> /* init_rootfs */ 19 #include <linux/init.h> /* init_rootfs */
20 #include <linux/fs_struct.h> /* get_fs_root et.al. */ 20 #include <linux/fs_struct.h> /* get_fs_root et.al. */
21 #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ 21 #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
22 #include <linux/uaccess.h> 22 #include <linux/uaccess.h>
23 #include <linux/proc_ns.h> 23 #include <linux/proc_ns.h>
24 #include <linux/magic.h> 24 #include <linux/magic.h>
25 #include <linux/bootmem.h> 25 #include <linux/bootmem.h>
26 #include <linux/task_work.h> 26 #include <linux/task_work.h>
27 #include "pnode.h" 27 #include "pnode.h"
28 #include "internal.h" 28 #include "internal.h"
29 29
30 static unsigned int m_hash_mask __read_mostly; 30 static unsigned int m_hash_mask __read_mostly;
31 static unsigned int m_hash_shift __read_mostly; 31 static unsigned int m_hash_shift __read_mostly;
32 static unsigned int mp_hash_mask __read_mostly; 32 static unsigned int mp_hash_mask __read_mostly;
33 static unsigned int mp_hash_shift __read_mostly; 33 static unsigned int mp_hash_shift __read_mostly;
34 34
35 static __initdata unsigned long mhash_entries; 35 static __initdata unsigned long mhash_entries;
36 static int __init set_mhash_entries(char *str) 36 static int __init set_mhash_entries(char *str)
37 { 37 {
38 if (!str) 38 if (!str)
39 return 0; 39 return 0;
40 mhash_entries = simple_strtoul(str, &str, 0); 40 mhash_entries = simple_strtoul(str, &str, 0);
41 return 1; 41 return 1;
42 } 42 }
43 __setup("mhash_entries=", set_mhash_entries); 43 __setup("mhash_entries=", set_mhash_entries);
44 44
45 static __initdata unsigned long mphash_entries; 45 static __initdata unsigned long mphash_entries;
46 static int __init set_mphash_entries(char *str) 46 static int __init set_mphash_entries(char *str)
47 { 47 {
48 if (!str) 48 if (!str)
49 return 0; 49 return 0;
50 mphash_entries = simple_strtoul(str, &str, 0); 50 mphash_entries = simple_strtoul(str, &str, 0);
51 return 1; 51 return 1;
52 } 52 }
53 __setup("mphash_entries=", set_mphash_entries); 53 __setup("mphash_entries=", set_mphash_entries);
54 54
55 static u64 event; 55 static u64 event;
56 static DEFINE_IDA(mnt_id_ida); 56 static DEFINE_IDA(mnt_id_ida);
57 static DEFINE_IDA(mnt_group_ida); 57 static DEFINE_IDA(mnt_group_ida);
58 static DEFINE_SPINLOCK(mnt_id_lock); 58 static DEFINE_SPINLOCK(mnt_id_lock);
59 static int mnt_id_start = 0; 59 static int mnt_id_start = 0;
60 static int mnt_group_start = 1; 60 static int mnt_group_start = 1;
61 61
62 static struct hlist_head *mount_hashtable __read_mostly; 62 static struct hlist_head *mount_hashtable __read_mostly;
63 static struct hlist_head *mountpoint_hashtable __read_mostly; 63 static struct hlist_head *mountpoint_hashtable __read_mostly;
64 static struct kmem_cache *mnt_cache __read_mostly; 64 static struct kmem_cache *mnt_cache __read_mostly;
65 static DECLARE_RWSEM(namespace_sem); 65 static DECLARE_RWSEM(namespace_sem);
66 66
67 /* /sys/fs */ 67 /* /sys/fs */
68 struct kobject *fs_kobj; 68 struct kobject *fs_kobj;
69 EXPORT_SYMBOL_GPL(fs_kobj); 69 EXPORT_SYMBOL_GPL(fs_kobj);
70 70
71 /* 71 /*
72 * vfsmount lock may be taken for read to prevent changes to the 72 * vfsmount lock may be taken for read to prevent changes to the
73 * vfsmount hash, ie. during mountpoint lookups or walking back 73 * vfsmount hash, ie. during mountpoint lookups or walking back
74 * up the tree. 74 * up the tree.
75 * 75 *
76 * It should be taken for write in all cases where the vfsmount 76 * It should be taken for write in all cases where the vfsmount
77 * tree or hash is modified or when a vfsmount structure is modified. 77 * tree or hash is modified or when a vfsmount structure is modified.
78 */ 78 */
79 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); 79 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
80 80
81 static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry) 81 static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
82 { 82 {
83 unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); 83 unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
84 tmp += ((unsigned long)dentry / L1_CACHE_BYTES); 84 tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
85 tmp = tmp + (tmp >> m_hash_shift); 85 tmp = tmp + (tmp >> m_hash_shift);
86 return &mount_hashtable[tmp & m_hash_mask]; 86 return &mount_hashtable[tmp & m_hash_mask];
87 } 87 }
88 88
89 static inline struct hlist_head *mp_hash(struct dentry *dentry) 89 static inline struct hlist_head *mp_hash(struct dentry *dentry)
90 { 90 {
91 unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES); 91 unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
92 tmp = tmp + (tmp >> mp_hash_shift); 92 tmp = tmp + (tmp >> mp_hash_shift);
93 return &mountpoint_hashtable[tmp & mp_hash_mask]; 93 return &mountpoint_hashtable[tmp & mp_hash_mask];
94 } 94 }
95 95
96 /* 96 /*
97 * allocation is serialized by namespace_sem, but we need the spinlock to 97 * allocation is serialized by namespace_sem, but we need the spinlock to
98 * serialize with freeing. 98 * serialize with freeing.
99 */ 99 */
100 static int mnt_alloc_id(struct mount *mnt) 100 static int mnt_alloc_id(struct mount *mnt)
101 { 101 {
102 int res; 102 int res;
103 103
104 retry: 104 retry:
105 ida_pre_get(&mnt_id_ida, GFP_KERNEL); 105 ida_pre_get(&mnt_id_ida, GFP_KERNEL);
106 spin_lock(&mnt_id_lock); 106 spin_lock(&mnt_id_lock);
107 res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id); 107 res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
108 if (!res) 108 if (!res)
109 mnt_id_start = mnt->mnt_id + 1; 109 mnt_id_start = mnt->mnt_id + 1;
110 spin_unlock(&mnt_id_lock); 110 spin_unlock(&mnt_id_lock);
111 if (res == -EAGAIN) 111 if (res == -EAGAIN)
112 goto retry; 112 goto retry;
113 113
114 return res; 114 return res;
115 } 115 }
116 116
117 static void mnt_free_id(struct mount *mnt) 117 static void mnt_free_id(struct mount *mnt)
118 { 118 {
119 int id = mnt->mnt_id; 119 int id = mnt->mnt_id;
120 spin_lock(&mnt_id_lock); 120 spin_lock(&mnt_id_lock);
121 ida_remove(&mnt_id_ida, id); 121 ida_remove(&mnt_id_ida, id);
122 if (mnt_id_start > id) 122 if (mnt_id_start > id)
123 mnt_id_start = id; 123 mnt_id_start = id;
124 spin_unlock(&mnt_id_lock); 124 spin_unlock(&mnt_id_lock);
125 } 125 }
126 126
127 /* 127 /*
128 * Allocate a new peer group ID 128 * Allocate a new peer group ID
129 * 129 *
130 * mnt_group_ida is protected by namespace_sem 130 * mnt_group_ida is protected by namespace_sem
131 */ 131 */
132 static int mnt_alloc_group_id(struct mount *mnt) 132 static int mnt_alloc_group_id(struct mount *mnt)
133 { 133 {
134 int res; 134 int res;
135 135
136 if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL)) 136 if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
137 return -ENOMEM; 137 return -ENOMEM;
138 138
139 res = ida_get_new_above(&mnt_group_ida, 139 res = ida_get_new_above(&mnt_group_ida,
140 mnt_group_start, 140 mnt_group_start,
141 &mnt->mnt_group_id); 141 &mnt->mnt_group_id);
142 if (!res) 142 if (!res)
143 mnt_group_start = mnt->mnt_group_id + 1; 143 mnt_group_start = mnt->mnt_group_id + 1;
144 144
145 return res; 145 return res;
146 } 146 }
147 147
148 /* 148 /*
149 * Release a peer group ID 149 * Release a peer group ID
150 */ 150 */
151 void mnt_release_group_id(struct mount *mnt) 151 void mnt_release_group_id(struct mount *mnt)
152 { 152 {
153 int id = mnt->mnt_group_id; 153 int id = mnt->mnt_group_id;
154 ida_remove(&mnt_group_ida, id); 154 ida_remove(&mnt_group_ida, id);
155 if (mnt_group_start > id) 155 if (mnt_group_start > id)
156 mnt_group_start = id; 156 mnt_group_start = id;
157 mnt->mnt_group_id = 0; 157 mnt->mnt_group_id = 0;
158 } 158 }
159 159
160 /* 160 /*
161 * vfsmount lock must be held for read 161 * vfsmount lock must be held for read
162 */ 162 */
163 static inline void mnt_add_count(struct mount *mnt, int n) 163 static inline void mnt_add_count(struct mount *mnt, int n)
164 { 164 {
165 #ifdef CONFIG_SMP 165 #ifdef CONFIG_SMP
166 this_cpu_add(mnt->mnt_pcp->mnt_count, n); 166 this_cpu_add(mnt->mnt_pcp->mnt_count, n);
167 #else 167 #else
168 preempt_disable(); 168 preempt_disable();
169 mnt->mnt_count += n; 169 mnt->mnt_count += n;
170 preempt_enable(); 170 preempt_enable();
171 #endif 171 #endif
172 } 172 }
173 173
174 /* 174 /*
175 * vfsmount lock must be held for write 175 * vfsmount lock must be held for write
176 */ 176 */
177 unsigned int mnt_get_count(struct mount *mnt) 177 unsigned int mnt_get_count(struct mount *mnt)
178 { 178 {
179 #ifdef CONFIG_SMP 179 #ifdef CONFIG_SMP
180 unsigned int count = 0; 180 unsigned int count = 0;
181 int cpu; 181 int cpu;
182 182
183 for_each_possible_cpu(cpu) { 183 for_each_possible_cpu(cpu) {
184 count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count; 184 count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
185 } 185 }
186 186
187 return count; 187 return count;
188 #else 188 #else
189 return mnt->mnt_count; 189 return mnt->mnt_count;
190 #endif 190 #endif
191 } 191 }
192 192
193 static struct mount *alloc_vfsmnt(const char *name) 193 static struct mount *alloc_vfsmnt(const char *name)
194 { 194 {
195 struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); 195 struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
196 if (mnt) { 196 if (mnt) {
197 int err; 197 int err;
198 198
199 err = mnt_alloc_id(mnt); 199 err = mnt_alloc_id(mnt);
200 if (err) 200 if (err)
201 goto out_free_cache; 201 goto out_free_cache;
202 202
203 if (name) { 203 if (name) {
204 mnt->mnt_devname = kstrdup(name, GFP_KERNEL); 204 mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
205 if (!mnt->mnt_devname) 205 if (!mnt->mnt_devname)
206 goto out_free_id; 206 goto out_free_id;
207 } 207 }
208 208
209 #ifdef CONFIG_SMP 209 #ifdef CONFIG_SMP
210 mnt->mnt_pcp = alloc_percpu(struct mnt_pcp); 210 mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
211 if (!mnt->mnt_pcp) 211 if (!mnt->mnt_pcp)
212 goto out_free_devname; 212 goto out_free_devname;
213 213
214 this_cpu_add(mnt->mnt_pcp->mnt_count, 1); 214 this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
215 #else 215 #else
216 mnt->mnt_count = 1; 216 mnt->mnt_count = 1;
217 mnt->mnt_writers = 0; 217 mnt->mnt_writers = 0;
218 #endif 218 #endif
219 219
220 INIT_HLIST_NODE(&mnt->mnt_hash); 220 INIT_HLIST_NODE(&mnt->mnt_hash);
221 INIT_LIST_HEAD(&mnt->mnt_child); 221 INIT_LIST_HEAD(&mnt->mnt_child);
222 INIT_LIST_HEAD(&mnt->mnt_mounts); 222 INIT_LIST_HEAD(&mnt->mnt_mounts);
223 INIT_LIST_HEAD(&mnt->mnt_list); 223 INIT_LIST_HEAD(&mnt->mnt_list);
224 INIT_LIST_HEAD(&mnt->mnt_expire); 224 INIT_LIST_HEAD(&mnt->mnt_expire);
225 INIT_LIST_HEAD(&mnt->mnt_share); 225 INIT_LIST_HEAD(&mnt->mnt_share);
226 INIT_LIST_HEAD(&mnt->mnt_slave_list); 226 INIT_LIST_HEAD(&mnt->mnt_slave_list);
227 INIT_LIST_HEAD(&mnt->mnt_slave); 227 INIT_LIST_HEAD(&mnt->mnt_slave);
228 INIT_HLIST_NODE(&mnt->mnt_mp_list); 228 INIT_HLIST_NODE(&mnt->mnt_mp_list);
229 #ifdef CONFIG_FSNOTIFY 229 #ifdef CONFIG_FSNOTIFY
230 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); 230 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
231 #endif 231 #endif
232 } 232 }
233 return mnt; 233 return mnt;
234 234
235 #ifdef CONFIG_SMP 235 #ifdef CONFIG_SMP
236 out_free_devname: 236 out_free_devname:
237 kfree(mnt->mnt_devname); 237 kfree(mnt->mnt_devname);
238 #endif 238 #endif
239 out_free_id: 239 out_free_id:
240 mnt_free_id(mnt); 240 mnt_free_id(mnt);
241 out_free_cache: 241 out_free_cache:
242 kmem_cache_free(mnt_cache, mnt); 242 kmem_cache_free(mnt_cache, mnt);
243 return NULL; 243 return NULL;
244 } 244 }
245 245
246 /* 246 /*
247 * Most r/o checks on a fs are for operations that take 247 * Most r/o checks on a fs are for operations that take
248 * discrete amounts of time, like a write() or unlink(). 248 * discrete amounts of time, like a write() or unlink().
249 * We must keep track of when those operations start 249 * We must keep track of when those operations start
250 * (for permission checks) and when they end, so that 250 * (for permission checks) and when they end, so that
251 * we can determine when writes are able to occur to 251 * we can determine when writes are able to occur to
252 * a filesystem. 252 * a filesystem.
253 */ 253 */
254 /* 254 /*
255 * __mnt_is_readonly: check whether a mount is read-only 255 * __mnt_is_readonly: check whether a mount is read-only
256 * @mnt: the mount to check for its write status 256 * @mnt: the mount to check for its write status
257 * 257 *
258 * This shouldn't be used directly ouside of the VFS. 258 * This shouldn't be used directly ouside of the VFS.
259 * It does not guarantee that the filesystem will stay 259 * It does not guarantee that the filesystem will stay
260 * r/w, just that it is right *now*. This can not and 260 * r/w, just that it is right *now*. This can not and
261 * should not be used in place of IS_RDONLY(inode). 261 * should not be used in place of IS_RDONLY(inode).
262 * mnt_want/drop_write() will _keep_ the filesystem 262 * mnt_want/drop_write() will _keep_ the filesystem
263 * r/w. 263 * r/w.
264 */ 264 */
265 int __mnt_is_readonly(struct vfsmount *mnt) 265 int __mnt_is_readonly(struct vfsmount *mnt)
266 { 266 {
267 if (mnt->mnt_flags & MNT_READONLY) 267 if (mnt->mnt_flags & MNT_READONLY)
268 return 1; 268 return 1;
269 if (mnt->mnt_sb->s_flags & MS_RDONLY) 269 if (mnt->mnt_sb->s_flags & MS_RDONLY)
270 return 1; 270 return 1;
271 return 0; 271 return 0;
272 } 272 }
273 EXPORT_SYMBOL_GPL(__mnt_is_readonly); 273 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
274 274
275 static inline void mnt_inc_writers(struct mount *mnt) 275 static inline void mnt_inc_writers(struct mount *mnt)
276 { 276 {
277 #ifdef CONFIG_SMP 277 #ifdef CONFIG_SMP
278 this_cpu_inc(mnt->mnt_pcp->mnt_writers); 278 this_cpu_inc(mnt->mnt_pcp->mnt_writers);
279 #else 279 #else
280 mnt->mnt_writers++; 280 mnt->mnt_writers++;
281 #endif 281 #endif
282 } 282 }
283 283
284 static inline void mnt_dec_writers(struct mount *mnt) 284 static inline void mnt_dec_writers(struct mount *mnt)
285 { 285 {
286 #ifdef CONFIG_SMP 286 #ifdef CONFIG_SMP
287 this_cpu_dec(mnt->mnt_pcp->mnt_writers); 287 this_cpu_dec(mnt->mnt_pcp->mnt_writers);
288 #else 288 #else
289 mnt->mnt_writers--; 289 mnt->mnt_writers--;
290 #endif 290 #endif
291 } 291 }
292 292
293 static unsigned int mnt_get_writers(struct mount *mnt) 293 static unsigned int mnt_get_writers(struct mount *mnt)
294 { 294 {
295 #ifdef CONFIG_SMP 295 #ifdef CONFIG_SMP
296 unsigned int count = 0; 296 unsigned int count = 0;
297 int cpu; 297 int cpu;
298 298
299 for_each_possible_cpu(cpu) { 299 for_each_possible_cpu(cpu) {
300 count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers; 300 count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
301 } 301 }
302 302
303 return count; 303 return count;
304 #else 304 #else
305 return mnt->mnt_writers; 305 return mnt->mnt_writers;
306 #endif 306 #endif
307 } 307 }
308 308
309 static int mnt_is_readonly(struct vfsmount *mnt) 309 static int mnt_is_readonly(struct vfsmount *mnt)
310 { 310 {
311 if (mnt->mnt_sb->s_readonly_remount) 311 if (mnt->mnt_sb->s_readonly_remount)
312 return 1; 312 return 1;
313 /* Order wrt setting s_flags/s_readonly_remount in do_remount() */ 313 /* Order wrt setting s_flags/s_readonly_remount in do_remount() */
314 smp_rmb(); 314 smp_rmb();
315 return __mnt_is_readonly(mnt); 315 return __mnt_is_readonly(mnt);
316 } 316 }
317 317
318 /* 318 /*
319 * Most r/o & frozen checks on a fs are for operations that take discrete 319 * Most r/o & frozen checks on a fs are for operations that take discrete
320 * amounts of time, like a write() or unlink(). We must keep track of when 320 * amounts of time, like a write() or unlink(). We must keep track of when
321 * those operations start (for permission checks) and when they end, so that we 321 * those operations start (for permission checks) and when they end, so that we
322 * can determine when writes are able to occur to a filesystem. 322 * can determine when writes are able to occur to a filesystem.
323 */ 323 */
324 /** 324 /**
325 * __mnt_want_write - get write access to a mount without freeze protection 325 * __mnt_want_write - get write access to a mount without freeze protection
326 * @m: the mount on which to take a write 326 * @m: the mount on which to take a write
327 * 327 *
328 * This tells the low-level filesystem that a write is about to be performed to 328 * This tells the low-level filesystem that a write is about to be performed to
329 * it, and makes sure that writes are allowed (mnt it read-write) before 329 * it, and makes sure that writes are allowed (mnt it read-write) before
330 * returning success. This operation does not protect against filesystem being 330 * returning success. This operation does not protect against filesystem being
331 * frozen. When the write operation is finished, __mnt_drop_write() must be 331 * frozen. When the write operation is finished, __mnt_drop_write() must be
332 * called. This is effectively a refcount. 332 * called. This is effectively a refcount.
333 */ 333 */
334 int __mnt_want_write(struct vfsmount *m) 334 int __mnt_want_write(struct vfsmount *m)
335 { 335 {
336 struct mount *mnt = real_mount(m); 336 struct mount *mnt = real_mount(m);
337 int ret = 0; 337 int ret = 0;
338 338
339 preempt_disable(); 339 preempt_disable();
340 mnt_inc_writers(mnt); 340 mnt_inc_writers(mnt);
341 /* 341 /*
342 * The store to mnt_inc_writers must be visible before we pass 342 * The store to mnt_inc_writers must be visible before we pass
343 * MNT_WRITE_HOLD loop below, so that the slowpath can see our 343 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
344 * incremented count after it has set MNT_WRITE_HOLD. 344 * incremented count after it has set MNT_WRITE_HOLD.
345 */ 345 */
346 smp_mb(); 346 smp_mb();
347 while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) 347 while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
348 cpu_relax(); 348 cpu_relax();
349 /* 349 /*
350 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will 350 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
351 * be set to match its requirements. So we must not load that until 351 * be set to match its requirements. So we must not load that until
352 * MNT_WRITE_HOLD is cleared. 352 * MNT_WRITE_HOLD is cleared.
353 */ 353 */
354 smp_rmb(); 354 smp_rmb();
355 if (mnt_is_readonly(m)) { 355 if (mnt_is_readonly(m)) {
356 mnt_dec_writers(mnt); 356 mnt_dec_writers(mnt);
357 ret = -EROFS; 357 ret = -EROFS;
358 } 358 }
359 preempt_enable(); 359 preempt_enable();
360 360
361 return ret; 361 return ret;
362 } 362 }
363 363
364 /** 364 /**
365 * mnt_want_write - get write access to a mount 365 * mnt_want_write - get write access to a mount
366 * @m: the mount on which to take a write 366 * @m: the mount on which to take a write
367 * 367 *
368 * This tells the low-level filesystem that a write is about to be performed to 368 * This tells the low-level filesystem that a write is about to be performed to
369 * it, and makes sure that writes are allowed (mount is read-write, filesystem 369 * it, and makes sure that writes are allowed (mount is read-write, filesystem
370 * is not frozen) before returning success. When the write operation is 370 * is not frozen) before returning success. When the write operation is
371 * finished, mnt_drop_write() must be called. This is effectively a refcount. 371 * finished, mnt_drop_write() must be called. This is effectively a refcount.
372 */ 372 */
373 int mnt_want_write(struct vfsmount *m) 373 int mnt_want_write(struct vfsmount *m)
374 { 374 {
375 int ret; 375 int ret;
376 376
377 sb_start_write(m->mnt_sb); 377 sb_start_write(m->mnt_sb);
378 ret = __mnt_want_write(m); 378 ret = __mnt_want_write(m);
379 if (ret) 379 if (ret)
380 sb_end_write(m->mnt_sb); 380 sb_end_write(m->mnt_sb);
381 return ret; 381 return ret;
382 } 382 }
383 EXPORT_SYMBOL_GPL(mnt_want_write); 383 EXPORT_SYMBOL_GPL(mnt_want_write);
384 384
385 /** 385 /**
386 * mnt_clone_write - get write access to a mount 386 * mnt_clone_write - get write access to a mount
387 * @mnt: the mount on which to take a write 387 * @mnt: the mount on which to take a write
388 * 388 *
389 * This is effectively like mnt_want_write, except 389 * This is effectively like mnt_want_write, except
390 * it must only be used to take an extra write reference 390 * it must only be used to take an extra write reference
391 * on a mountpoint that we already know has a write reference 391 * on a mountpoint that we already know has a write reference
392 * on it. This allows some optimisation. 392 * on it. This allows some optimisation.
393 * 393 *
394 * After finished, mnt_drop_write must be called as usual to 394 * After finished, mnt_drop_write must be called as usual to
395 * drop the reference. 395 * drop the reference.
396 */ 396 */
397 int mnt_clone_write(struct vfsmount *mnt) 397 int mnt_clone_write(struct vfsmount *mnt)
398 { 398 {
399 /* superblock may be r/o */ 399 /* superblock may be r/o */
400 if (__mnt_is_readonly(mnt)) 400 if (__mnt_is_readonly(mnt))
401 return -EROFS; 401 return -EROFS;
402 preempt_disable(); 402 preempt_disable();
403 mnt_inc_writers(real_mount(mnt)); 403 mnt_inc_writers(real_mount(mnt));
404 preempt_enable(); 404 preempt_enable();
405 return 0; 405 return 0;
406 } 406 }
407 EXPORT_SYMBOL_GPL(mnt_clone_write); 407 EXPORT_SYMBOL_GPL(mnt_clone_write);
408 408
409 /** 409 /**
410 * __mnt_want_write_file - get write access to a file's mount 410 * __mnt_want_write_file - get write access to a file's mount
411 * @file: the file who's mount on which to take a write 411 * @file: the file who's mount on which to take a write
412 * 412 *
413 * This is like __mnt_want_write, but it takes a file and can 413 * This is like __mnt_want_write, but it takes a file and can
414 * do some optimisations if the file is open for write already 414 * do some optimisations if the file is open for write already
415 */ 415 */
416 int __mnt_want_write_file(struct file *file) 416 int __mnt_want_write_file(struct file *file)
417 { 417 {
418 if (!(file->f_mode & FMODE_WRITER)) 418 if (!(file->f_mode & FMODE_WRITER))
419 return __mnt_want_write(file->f_path.mnt); 419 return __mnt_want_write(file->f_path.mnt);
420 else 420 else
421 return mnt_clone_write(file->f_path.mnt); 421 return mnt_clone_write(file->f_path.mnt);
422 } 422 }
423 423
424 /** 424 /**
425 * mnt_want_write_file - get write access to a file's mount 425 * mnt_want_write_file - get write access to a file's mount
426 * @file: the file who's mount on which to take a write 426 * @file: the file who's mount on which to take a write
427 * 427 *
428 * This is like mnt_want_write, but it takes a file and can 428 * This is like mnt_want_write, but it takes a file and can
429 * do some optimisations if the file is open for write already 429 * do some optimisations if the file is open for write already
430 */ 430 */
431 int mnt_want_write_file(struct file *file) 431 int mnt_want_write_file(struct file *file)
432 { 432 {
433 int ret; 433 int ret;
434 434
435 sb_start_write(file->f_path.mnt->mnt_sb); 435 sb_start_write(file->f_path.mnt->mnt_sb);
436 ret = __mnt_want_write_file(file); 436 ret = __mnt_want_write_file(file);
437 if (ret) 437 if (ret)
438 sb_end_write(file->f_path.mnt->mnt_sb); 438 sb_end_write(file->f_path.mnt->mnt_sb);
439 return ret; 439 return ret;
440 } 440 }
441 EXPORT_SYMBOL_GPL(mnt_want_write_file); 441 EXPORT_SYMBOL_GPL(mnt_want_write_file);
442 442
443 /** 443 /**
444 * __mnt_drop_write - give up write access to a mount 444 * __mnt_drop_write - give up write access to a mount
445 * @mnt: the mount on which to give up write access 445 * @mnt: the mount on which to give up write access
446 * 446 *
447 * Tells the low-level filesystem that we are done 447 * Tells the low-level filesystem that we are done
448 * performing writes to it. Must be matched with 448 * performing writes to it. Must be matched with
449 * __mnt_want_write() call above. 449 * __mnt_want_write() call above.
450 */ 450 */
451 void __mnt_drop_write(struct vfsmount *mnt) 451 void __mnt_drop_write(struct vfsmount *mnt)
452 { 452 {
453 preempt_disable(); 453 preempt_disable();
454 mnt_dec_writers(real_mount(mnt)); 454 mnt_dec_writers(real_mount(mnt));
455 preempt_enable(); 455 preempt_enable();
456 } 456 }
457 457
458 /** 458 /**
459 * mnt_drop_write - give up write access to a mount 459 * mnt_drop_write - give up write access to a mount
460 * @mnt: the mount on which to give up write access 460 * @mnt: the mount on which to give up write access
461 * 461 *
462 * Tells the low-level filesystem that we are done performing writes to it and 462 * Tells the low-level filesystem that we are done performing writes to it and
463 * also allows filesystem to be frozen again. Must be matched with 463 * also allows filesystem to be frozen again. Must be matched with
464 * mnt_want_write() call above. 464 * mnt_want_write() call above.
465 */ 465 */
466 void mnt_drop_write(struct vfsmount *mnt) 466 void mnt_drop_write(struct vfsmount *mnt)
467 { 467 {
468 __mnt_drop_write(mnt); 468 __mnt_drop_write(mnt);
469 sb_end_write(mnt->mnt_sb); 469 sb_end_write(mnt->mnt_sb);
470 } 470 }
471 EXPORT_SYMBOL_GPL(mnt_drop_write); 471 EXPORT_SYMBOL_GPL(mnt_drop_write);
472 472
473 void __mnt_drop_write_file(struct file *file) 473 void __mnt_drop_write_file(struct file *file)
474 { 474 {
475 __mnt_drop_write(file->f_path.mnt); 475 __mnt_drop_write(file->f_path.mnt);
476 } 476 }
477 477
478 void mnt_drop_write_file(struct file *file) 478 void mnt_drop_write_file(struct file *file)
479 { 479 {
480 mnt_drop_write(file->f_path.mnt); 480 mnt_drop_write(file->f_path.mnt);
481 } 481 }
482 EXPORT_SYMBOL(mnt_drop_write_file); 482 EXPORT_SYMBOL(mnt_drop_write_file);
483 483
484 static int mnt_make_readonly(struct mount *mnt) 484 static int mnt_make_readonly(struct mount *mnt)
485 { 485 {
486 int ret = 0; 486 int ret = 0;
487 487
488 lock_mount_hash(); 488 lock_mount_hash();
489 mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; 489 mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
490 /* 490 /*
491 * After storing MNT_WRITE_HOLD, we'll read the counters. This store 491 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
492 * should be visible before we do. 492 * should be visible before we do.
493 */ 493 */
494 smp_mb(); 494 smp_mb();
495 495
496 /* 496 /*
497 * With writers on hold, if this value is zero, then there are 497 * With writers on hold, if this value is zero, then there are
498 * definitely no active writers (although held writers may subsequently 498 * definitely no active writers (although held writers may subsequently
499 * increment the count, they'll have to wait, and decrement it after 499 * increment the count, they'll have to wait, and decrement it after
500 * seeing MNT_READONLY). 500 * seeing MNT_READONLY).
501 * 501 *
502 * It is OK to have counter incremented on one CPU and decremented on 502 * It is OK to have counter incremented on one CPU and decremented on
503 * another: the sum will add up correctly. The danger would be when we 503 * another: the sum will add up correctly. The danger would be when we
504 * sum up each counter, if we read a counter before it is incremented, 504 * sum up each counter, if we read a counter before it is incremented,
505 * but then read another CPU's count which it has been subsequently 505 * but then read another CPU's count which it has been subsequently
506 * decremented from -- we would see more decrements than we should. 506 * decremented from -- we would see more decrements than we should.
507 * MNT_WRITE_HOLD protects against this scenario, because 507 * MNT_WRITE_HOLD protects against this scenario, because
508 * mnt_want_write first increments count, then smp_mb, then spins on 508 * mnt_want_write first increments count, then smp_mb, then spins on
509 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while 509 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
510 * we're counting up here. 510 * we're counting up here.
511 */ 511 */
512 if (mnt_get_writers(mnt) > 0) 512 if (mnt_get_writers(mnt) > 0)
513 ret = -EBUSY; 513 ret = -EBUSY;
514 else 514 else
515 mnt->mnt.mnt_flags |= MNT_READONLY; 515 mnt->mnt.mnt_flags |= MNT_READONLY;
516 /* 516 /*
517 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers 517 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
518 * that become unheld will see MNT_READONLY. 518 * that become unheld will see MNT_READONLY.
519 */ 519 */
520 smp_wmb(); 520 smp_wmb();
521 mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; 521 mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
522 unlock_mount_hash(); 522 unlock_mount_hash();
523 return ret; 523 return ret;
524 } 524 }
525 525
526 static void __mnt_unmake_readonly(struct mount *mnt) 526 static void __mnt_unmake_readonly(struct mount *mnt)
527 { 527 {
528 lock_mount_hash(); 528 lock_mount_hash();
529 mnt->mnt.mnt_flags &= ~MNT_READONLY; 529 mnt->mnt.mnt_flags &= ~MNT_READONLY;
530 unlock_mount_hash(); 530 unlock_mount_hash();
531 } 531 }
532 532
533 int sb_prepare_remount_readonly(struct super_block *sb) 533 int sb_prepare_remount_readonly(struct super_block *sb)
534 { 534 {
535 struct mount *mnt; 535 struct mount *mnt;
536 int err = 0; 536 int err = 0;
537 537
538 /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */ 538 /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */
539 if (atomic_long_read(&sb->s_remove_count)) 539 if (atomic_long_read(&sb->s_remove_count))
540 return -EBUSY; 540 return -EBUSY;
541 541
542 lock_mount_hash(); 542 lock_mount_hash();
543 list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { 543 list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
544 if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { 544 if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
545 mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; 545 mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
546 smp_mb(); 546 smp_mb();
547 if (mnt_get_writers(mnt) > 0) { 547 if (mnt_get_writers(mnt) > 0) {
548 err = -EBUSY; 548 err = -EBUSY;
549 break; 549 break;
550 } 550 }
551 } 551 }
552 } 552 }
553 if (!err && atomic_long_read(&sb->s_remove_count)) 553 if (!err && atomic_long_read(&sb->s_remove_count))
554 err = -EBUSY; 554 err = -EBUSY;
555 555
556 if (!err) { 556 if (!err) {
557 sb->s_readonly_remount = 1; 557 sb->s_readonly_remount = 1;
558 smp_wmb(); 558 smp_wmb();
559 } 559 }
560 list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { 560 list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
561 if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) 561 if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
562 mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; 562 mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
563 } 563 }
564 unlock_mount_hash(); 564 unlock_mount_hash();
565 565
566 return err; 566 return err;
567 } 567 }
568 568
569 static void free_vfsmnt(struct mount *mnt) 569 static void free_vfsmnt(struct mount *mnt)
570 { 570 {
571 kfree(mnt->mnt_devname); 571 kfree(mnt->mnt_devname);
572 #ifdef CONFIG_SMP 572 #ifdef CONFIG_SMP
573 free_percpu(mnt->mnt_pcp); 573 free_percpu(mnt->mnt_pcp);
574 #endif 574 #endif
575 kmem_cache_free(mnt_cache, mnt); 575 kmem_cache_free(mnt_cache, mnt);
576 } 576 }
577 577
578 static void delayed_free_vfsmnt(struct rcu_head *head) 578 static void delayed_free_vfsmnt(struct rcu_head *head)
579 { 579 {
580 free_vfsmnt(container_of(head, struct mount, mnt_rcu)); 580 free_vfsmnt(container_of(head, struct mount, mnt_rcu));
581 } 581 }
582 582
583 /* call under rcu_read_lock */ 583 /* call under rcu_read_lock */
584 bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) 584 bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
585 { 585 {
586 struct mount *mnt; 586 struct mount *mnt;
587 if (read_seqretry(&mount_lock, seq)) 587 if (read_seqretry(&mount_lock, seq))
588 return false; 588 return false;
589 if (bastard == NULL) 589 if (bastard == NULL)
590 return true; 590 return true;
591 mnt = real_mount(bastard); 591 mnt = real_mount(bastard);
592 mnt_add_count(mnt, 1); 592 mnt_add_count(mnt, 1);
593 if (likely(!read_seqretry(&mount_lock, seq))) 593 if (likely(!read_seqretry(&mount_lock, seq)))
594 return true; 594 return true;
595 if (bastard->mnt_flags & MNT_SYNC_UMOUNT) { 595 if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
596 mnt_add_count(mnt, -1); 596 mnt_add_count(mnt, -1);
597 return false; 597 return false;
598 } 598 }
599 rcu_read_unlock(); 599 rcu_read_unlock();
600 mntput(bastard); 600 mntput(bastard);
601 rcu_read_lock(); 601 rcu_read_lock();
602 return false; 602 return false;
603 } 603 }
604 604
605 /* 605 /*
606 * find the first mount at @dentry on vfsmount @mnt. 606 * find the first mount at @dentry on vfsmount @mnt.
607 * call under rcu_read_lock() 607 * call under rcu_read_lock()
608 */ 608 */
609 struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) 609 struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
610 { 610 {
611 struct hlist_head *head = m_hash(mnt, dentry); 611 struct hlist_head *head = m_hash(mnt, dentry);
612 struct mount *p; 612 struct mount *p;
613 613
614 hlist_for_each_entry_rcu(p, head, mnt_hash) 614 hlist_for_each_entry_rcu(p, head, mnt_hash)
615 if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) 615 if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
616 return p; 616 return p;
617 return NULL; 617 return NULL;
618 } 618 }
619 619
620 /* 620 /*
621 * find the last mount at @dentry on vfsmount @mnt. 621 * find the last mount at @dentry on vfsmount @mnt.
622 * mount_lock must be held. 622 * mount_lock must be held.
623 */ 623 */
624 struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) 624 struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
625 { 625 {
626 struct mount *p, *res; 626 struct mount *p, *res;
627 res = p = __lookup_mnt(mnt, dentry); 627 res = p = __lookup_mnt(mnt, dentry);
628 if (!p) 628 if (!p)
629 goto out; 629 goto out;
630 hlist_for_each_entry_continue(p, mnt_hash) { 630 hlist_for_each_entry_continue(p, mnt_hash) {
631 if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry) 631 if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
632 break; 632 break;
633 res = p; 633 res = p;
634 } 634 }
635 out: 635 out:
636 return res; 636 return res;
637 } 637 }
638 638
639 /* 639 /*
640 * lookup_mnt - Return the first child mount mounted at path 640 * lookup_mnt - Return the first child mount mounted at path
641 * 641 *
642 * "First" means first mounted chronologically. If you create the 642 * "First" means first mounted chronologically. If you create the
643 * following mounts: 643 * following mounts:
644 * 644 *
645 * mount /dev/sda1 /mnt 645 * mount /dev/sda1 /mnt
646 * mount /dev/sda2 /mnt 646 * mount /dev/sda2 /mnt
647 * mount /dev/sda3 /mnt 647 * mount /dev/sda3 /mnt
648 * 648 *
649 * Then lookup_mnt() on the base /mnt dentry in the root mount will 649 * Then lookup_mnt() on the base /mnt dentry in the root mount will
650 * return successively the root dentry and vfsmount of /dev/sda1, then 650 * return successively the root dentry and vfsmount of /dev/sda1, then
651 * /dev/sda2, then /dev/sda3, then NULL. 651 * /dev/sda2, then /dev/sda3, then NULL.
652 * 652 *
653 * lookup_mnt takes a reference to the found vfsmount. 653 * lookup_mnt takes a reference to the found vfsmount.
654 */ 654 */
655 struct vfsmount *lookup_mnt(struct path *path) 655 struct vfsmount *lookup_mnt(struct path *path)
656 { 656 {
657 struct mount *child_mnt; 657 struct mount *child_mnt;
658 struct vfsmount *m; 658 struct vfsmount *m;
659 unsigned seq; 659 unsigned seq;
660 660
661 rcu_read_lock(); 661 rcu_read_lock();
662 do { 662 do {
663 seq = read_seqbegin(&mount_lock); 663 seq = read_seqbegin(&mount_lock);
664 child_mnt = __lookup_mnt(path->mnt, path->dentry); 664 child_mnt = __lookup_mnt(path->mnt, path->dentry);
665 m = child_mnt ? &child_mnt->mnt : NULL; 665 m = child_mnt ? &child_mnt->mnt : NULL;
666 } while (!legitimize_mnt(m, seq)); 666 } while (!legitimize_mnt(m, seq));
667 rcu_read_unlock(); 667 rcu_read_unlock();
668 return m; 668 return m;
669 } 669 }
670 670
671 /* 671 /*
672 * __is_local_mountpoint - Test to see if dentry is a mountpoint in the 672 * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
673 * current mount namespace. 673 * current mount namespace.
674 * 674 *
675 * The common case is dentries are not mountpoints at all and that 675 * The common case is dentries are not mountpoints at all and that
676 * test is handled inline. For the slow case when we are actually 676 * test is handled inline. For the slow case when we are actually
677 * dealing with a mountpoint of some kind, walk through all of the 677 * dealing with a mountpoint of some kind, walk through all of the
678 * mounts in the current mount namespace and test to see if the dentry 678 * mounts in the current mount namespace and test to see if the dentry
679 * is a mountpoint. 679 * is a mountpoint.
680 * 680 *
681 * The mount_hashtable is not usable in the context because we 681 * The mount_hashtable is not usable in the context because we
682 * need to identify all mounts that may be in the current mount 682 * need to identify all mounts that may be in the current mount
683 * namespace not just a mount that happens to have some specified 683 * namespace not just a mount that happens to have some specified
684 * parent mount. 684 * parent mount.
685 */ 685 */
686 bool __is_local_mountpoint(struct dentry *dentry) 686 bool __is_local_mountpoint(struct dentry *dentry)
687 { 687 {
688 struct mnt_namespace *ns = current->nsproxy->mnt_ns; 688 struct mnt_namespace *ns = current->nsproxy->mnt_ns;
689 struct mount *mnt; 689 struct mount *mnt;
690 bool is_covered = false; 690 bool is_covered = false;
691 691
692 if (!d_mountpoint(dentry)) 692 if (!d_mountpoint(dentry))
693 goto out; 693 goto out;
694 694
695 down_read(&namespace_sem); 695 down_read(&namespace_sem);
696 list_for_each_entry(mnt, &ns->list, mnt_list) { 696 list_for_each_entry(mnt, &ns->list, mnt_list) {
697 is_covered = (mnt->mnt_mountpoint == dentry); 697 is_covered = (mnt->mnt_mountpoint == dentry);
698 if (is_covered) 698 if (is_covered)
699 break; 699 break;
700 } 700 }
701 up_read(&namespace_sem); 701 up_read(&namespace_sem);
702 out: 702 out:
703 return is_covered; 703 return is_covered;
704 } 704 }
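
For reference, the fast-path wrapper mentioned in the comment above lives in fs/mount.h and looks roughly like this (paraphrased sketch):

static inline bool is_local_mountpoint(struct dentry *dentry)
{
	if (!d_mountpoint(dentry))
		return false;		/* common case, no semaphore taken */
	return __is_local_mountpoint(dentry);
}
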
705 705
706 static struct mountpoint *lookup_mountpoint(struct dentry *dentry) 706 static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
707 { 707 {
708 struct hlist_head *chain = mp_hash(dentry); 708 struct hlist_head *chain = mp_hash(dentry);
709 struct mountpoint *mp; 709 struct mountpoint *mp;
710 710
711 hlist_for_each_entry(mp, chain, m_hash) { 711 hlist_for_each_entry(mp, chain, m_hash) {
712 if (mp->m_dentry == dentry) { 712 if (mp->m_dentry == dentry) {
713 /* might be worth a WARN_ON() */ 713 /* might be worth a WARN_ON() */
714 if (d_unlinked(dentry)) 714 if (d_unlinked(dentry))
715 return ERR_PTR(-ENOENT); 715 return ERR_PTR(-ENOENT);
716 mp->m_count++; 716 mp->m_count++;
717 return mp; 717 return mp;
718 } 718 }
719 } 719 }
720 return NULL; 720 return NULL;
721 } 721 }
722 722
723 static struct mountpoint *new_mountpoint(struct dentry *dentry) 723 static struct mountpoint *new_mountpoint(struct dentry *dentry)
724 { 724 {
725 struct hlist_head *chain = mp_hash(dentry); 725 struct hlist_head *chain = mp_hash(dentry);
726 struct mountpoint *mp; 726 struct mountpoint *mp;
727 int ret; 727 int ret;
728 728
729 mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); 729 mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
730 if (!mp) 730 if (!mp)
731 return ERR_PTR(-ENOMEM); 731 return ERR_PTR(-ENOMEM);
732 732
733 ret = d_set_mounted(dentry); 733 ret = d_set_mounted(dentry);
734 if (ret) { 734 if (ret) {
735 kfree(mp); 735 kfree(mp);
736 return ERR_PTR(ret); 736 return ERR_PTR(ret);
737 } 737 }
738 738
739 mp->m_dentry = dentry; 739 mp->m_dentry = dentry;
740 mp->m_count = 1; 740 mp->m_count = 1;
741 hlist_add_head(&mp->m_hash, chain); 741 hlist_add_head(&mp->m_hash, chain);
742 INIT_HLIST_HEAD(&mp->m_list); 742 INIT_HLIST_HEAD(&mp->m_list);
743 return mp; 743 return mp;
744 } 744 }
745 745
746 static void put_mountpoint(struct mountpoint *mp) 746 static void put_mountpoint(struct mountpoint *mp)
747 { 747 {
748 if (!--mp->m_count) { 748 if (!--mp->m_count) {
749 struct dentry *dentry = mp->m_dentry; 749 struct dentry *dentry = mp->m_dentry;
750 BUG_ON(!hlist_empty(&mp->m_list)); 750 BUG_ON(!hlist_empty(&mp->m_list));
751 spin_lock(&dentry->d_lock); 751 spin_lock(&dentry->d_lock);
752 dentry->d_flags &= ~DCACHE_MOUNTED; 752 dentry->d_flags &= ~DCACHE_MOUNTED;
753 spin_unlock(&dentry->d_lock); 753 spin_unlock(&dentry->d_lock);
754 hlist_del(&mp->m_hash); 754 hlist_del(&mp->m_hash);
755 kfree(mp); 755 kfree(mp);
756 } 756 }
757 } 757 }
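
Taken together, lookup_mountpoint()/new_mountpoint()/put_mountpoint() form a small refcounted registry keyed by dentry. A sketch of the usual find-or-create pattern (get_mountpoint_sketch() is hypothetical; the real callers in this file wrap it in mount locking):

/* caller holds namespace_sem */
static struct mountpoint *get_mountpoint_sketch(struct dentry *dentry)
{
	struct mountpoint *mp = lookup_mountpoint(dentry);	/* takes a ref */

	if (IS_ERR(mp))
		return mp;			/* dentry was unlinked under us */
	if (!mp)
		mp = new_mountpoint(dentry);	/* fresh mp, m_count == 1 */
	return mp;				/* balance with put_mountpoint() */
}
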
758 758
759 static inline int check_mnt(struct mount *mnt) 759 static inline int check_mnt(struct mount *mnt)
760 { 760 {
761 return mnt->mnt_ns == current->nsproxy->mnt_ns; 761 return mnt->mnt_ns == current->nsproxy->mnt_ns;
762 } 762 }
763 763
764 /* 764 /*
765 * vfsmount lock must be held for write 765 * vfsmount lock must be held for write
766 */ 766 */
767 static void touch_mnt_namespace(struct mnt_namespace *ns) 767 static void touch_mnt_namespace(struct mnt_namespace *ns)
768 { 768 {
769 if (ns) { 769 if (ns) {
770 ns->event = ++event; 770 ns->event = ++event;
771 wake_up_interruptible(&ns->poll); 771 wake_up_interruptible(&ns->poll);
772 } 772 }
773 } 773 }
774 774
775 /* 775 /*
776 * vfsmount lock must be held for write 776 * vfsmount lock must be held for write
777 */ 777 */
778 static void __touch_mnt_namespace(struct mnt_namespace *ns) 778 static void __touch_mnt_namespace(struct mnt_namespace *ns)
779 { 779 {
780 if (ns && ns->event != event) { 780 if (ns && ns->event != event) {
781 ns->event = event; 781 ns->event = event;
782 wake_up_interruptible(&ns->poll); 782 wake_up_interruptible(&ns->poll);
783 } 783 }
784 } 784 }
785 785
786 /* 786 /*
787 * vfsmount lock must be held for write 787 * vfsmount lock must be held for write
788 */ 788 */
789 static void detach_mnt(struct mount *mnt, struct path *old_path) 789 static void detach_mnt(struct mount *mnt, struct path *old_path)
790 { 790 {
791 old_path->dentry = mnt->mnt_mountpoint; 791 old_path->dentry = mnt->mnt_mountpoint;
792 old_path->mnt = &mnt->mnt_parent->mnt; 792 old_path->mnt = &mnt->mnt_parent->mnt;
793 mnt->mnt_parent = mnt; 793 mnt->mnt_parent = mnt;
794 mnt->mnt_mountpoint = mnt->mnt.mnt_root; 794 mnt->mnt_mountpoint = mnt->mnt.mnt_root;
795 list_del_init(&mnt->mnt_child); 795 list_del_init(&mnt->mnt_child);
796 hlist_del_init_rcu(&mnt->mnt_hash); 796 hlist_del_init_rcu(&mnt->mnt_hash);
797 hlist_del_init(&mnt->mnt_mp_list); 797 hlist_del_init(&mnt->mnt_mp_list);
798 put_mountpoint(mnt->mnt_mp); 798 put_mountpoint(mnt->mnt_mp);
799 mnt->mnt_mp = NULL; 799 mnt->mnt_mp = NULL;
800 } 800 }
801 801
802 /* 802 /*
803 * vfsmount lock must be held for write 803 * vfsmount lock must be held for write
804 */ 804 */
805 void mnt_set_mountpoint(struct mount *mnt, 805 void mnt_set_mountpoint(struct mount *mnt,
806 struct mountpoint *mp, 806 struct mountpoint *mp,
807 struct mount *child_mnt) 807 struct mount *child_mnt)
808 { 808 {
809 mp->m_count++; 809 mp->m_count++;
810 mnt_add_count(mnt, 1); /* essentially, that's mntget */ 810 mnt_add_count(mnt, 1); /* essentially, that's mntget */
811 child_mnt->mnt_mountpoint = dget(mp->m_dentry); 811 child_mnt->mnt_mountpoint = dget(mp->m_dentry);
812 child_mnt->mnt_parent = mnt; 812 child_mnt->mnt_parent = mnt;
813 child_mnt->mnt_mp = mp; 813 child_mnt->mnt_mp = mp;
814 hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); 814 hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
815 } 815 }
816 816
817 /* 817 /*
818 * vfsmount lock must be held for write 818 * vfsmount lock must be held for write
819 */ 819 */
820 static void attach_mnt(struct mount *mnt, 820 static void attach_mnt(struct mount *mnt,
821 struct mount *parent, 821 struct mount *parent,
822 struct mountpoint *mp) 822 struct mountpoint *mp)
823 { 823 {
824 mnt_set_mountpoint(parent, mp, mnt); 824 mnt_set_mountpoint(parent, mp, mnt);
825 hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); 825 hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry));
826 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); 826 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
827 } 827 }
828 828
829 static void attach_shadowed(struct mount *mnt, 829 static void attach_shadowed(struct mount *mnt,
830 struct mount *parent, 830 struct mount *parent,
831 struct mount *shadows) 831 struct mount *shadows)
832 { 832 {
833 if (shadows) { 833 if (shadows) {
834 hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash); 834 hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
835 list_add(&mnt->mnt_child, &shadows->mnt_child); 835 list_add(&mnt->mnt_child, &shadows->mnt_child);
836 } else { 836 } else {
837 hlist_add_head_rcu(&mnt->mnt_hash, 837 hlist_add_head_rcu(&mnt->mnt_hash,
838 m_hash(&parent->mnt, mnt->mnt_mountpoint)); 838 m_hash(&parent->mnt, mnt->mnt_mountpoint));
839 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); 839 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
840 } 840 }
841 } 841 }
842 842
843 /* 843 /*
844 * vfsmount lock must be held for write 844 * vfsmount lock must be held for write
845 */ 845 */
846 static void commit_tree(struct mount *mnt, struct mount *shadows) 846 static void commit_tree(struct mount *mnt, struct mount *shadows)
847 { 847 {
848 struct mount *parent = mnt->mnt_parent; 848 struct mount *parent = mnt->mnt_parent;
849 struct mount *m; 849 struct mount *m;
850 LIST_HEAD(head); 850 LIST_HEAD(head);
851 struct mnt_namespace *n = parent->mnt_ns; 851 struct mnt_namespace *n = parent->mnt_ns;
852 852
853 BUG_ON(parent == mnt); 853 BUG_ON(parent == mnt);
854 854
855 list_add_tail(&head, &mnt->mnt_list); 855 list_add_tail(&head, &mnt->mnt_list);
856 list_for_each_entry(m, &head, mnt_list) 856 list_for_each_entry(m, &head, mnt_list)
857 m->mnt_ns = n; 857 m->mnt_ns = n;
858 858
859 list_splice(&head, n->list.prev); 859 list_splice(&head, n->list.prev);
860 860
861 attach_shadowed(mnt, parent, shadows); 861 attach_shadowed(mnt, parent, shadows);
862 touch_mnt_namespace(n); 862 touch_mnt_namespace(n);
863 } 863 }
864 864
865 static struct mount *next_mnt(struct mount *p, struct mount *root) 865 static struct mount *next_mnt(struct mount *p, struct mount *root)
866 { 866 {
867 struct list_head *next = p->mnt_mounts.next; 867 struct list_head *next = p->mnt_mounts.next;
868 if (next == &p->mnt_mounts) { 868 if (next == &p->mnt_mounts) {
869 while (1) { 869 while (1) {
870 if (p == root) 870 if (p == root)
871 return NULL; 871 return NULL;
872 next = p->mnt_child.next; 872 next = p->mnt_child.next;
873 if (next != &p->mnt_parent->mnt_mounts) 873 if (next != &p->mnt_parent->mnt_mounts)
874 break; 874 break;
875 p = p->mnt_parent; 875 p = p->mnt_parent;
876 } 876 }
877 } 877 }
878 return list_entry(next, struct mount, mnt_child); 878 return list_entry(next, struct mount, mnt_child);
879 } 879 }
880 880
881 static struct mount *skip_mnt_tree(struct mount *p) 881 static struct mount *skip_mnt_tree(struct mount *p)
882 { 882 {
883 struct list_head *prev = p->mnt_mounts.prev; 883 struct list_head *prev = p->mnt_mounts.prev;
884 while (prev != &p->mnt_mounts) { 884 while (prev != &p->mnt_mounts) {
885 p = list_entry(prev, struct mount, mnt_child); 885 p = list_entry(prev, struct mount, mnt_child);
886 prev = p->mnt_mounts.prev; 886 prev = p->mnt_mounts.prev;
887 } 887 }
888 return p; 888 return p;
889 } 889 }
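
next_mnt() yields a pre-order, depth-first walk of a mount tree, and skip_mnt_tree() jumps to the last node of a subtree so a walk can leap over it. A minimal sketch of the iteration pattern (count_tree() is hypothetical; compare may_umount_tree() and umount_tree() below):

static unsigned int count_tree(struct mount *root)
{
	struct mount *p;
	unsigned int n = 0;

	/* visits root first, then its children depth-first */
	for (p = root; p; p = next_mnt(p, root))
		n++;
	return n;
}
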
890 890
891 struct vfsmount * 891 struct vfsmount *
892 vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) 892 vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
893 { 893 {
894 struct mount *mnt; 894 struct mount *mnt;
895 struct dentry *root; 895 struct dentry *root;
896 896
897 if (!type) 897 if (!type)
898 return ERR_PTR(-ENODEV); 898 return ERR_PTR(-ENODEV);
899 899
900 mnt = alloc_vfsmnt(name); 900 mnt = alloc_vfsmnt(name);
901 if (!mnt) 901 if (!mnt)
902 return ERR_PTR(-ENOMEM); 902 return ERR_PTR(-ENOMEM);
903 903
904 if (flags & MS_KERNMOUNT) 904 if (flags & MS_KERNMOUNT)
905 mnt->mnt.mnt_flags = MNT_INTERNAL; 905 mnt->mnt.mnt_flags = MNT_INTERNAL;
906 906
907 root = mount_fs(type, flags, name, data); 907 root = mount_fs(type, flags, name, data);
908 if (IS_ERR(root)) { 908 if (IS_ERR(root)) {
909 mnt_free_id(mnt); 909 mnt_free_id(mnt);
910 free_vfsmnt(mnt); 910 free_vfsmnt(mnt);
911 return ERR_CAST(root); 911 return ERR_CAST(root);
912 } 912 }
913 913
914 mnt->mnt.mnt_root = root; 914 mnt->mnt.mnt_root = root;
915 mnt->mnt.mnt_sb = root->d_sb; 915 mnt->mnt.mnt_sb = root->d_sb;
916 mnt->mnt_mountpoint = mnt->mnt.mnt_root; 916 mnt->mnt_mountpoint = mnt->mnt.mnt_root;
917 mnt->mnt_parent = mnt; 917 mnt->mnt_parent = mnt;
918 lock_mount_hash(); 918 lock_mount_hash();
919 list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts); 919 list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
920 unlock_mount_hash(); 920 unlock_mount_hash();
921 return &mnt->mnt; 921 return &mnt->mnt;
922 } 922 }
923 EXPORT_SYMBOL_GPL(vfs_kern_mount); 923 EXPORT_SYMBOL_GPL(vfs_kern_mount);
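
Kernel-internal users normally reach this through the kern_mount() wrapper, which passes MS_KERNMOUNT so the resulting mount is flagged MNT_INTERNAL (see above). A sketch of the typical init-time use, in the spirit of the internal nsfs mount this commit introduces (example_fs_type and example_mnt are hypothetical):

static struct vfsmount *example_mnt;

static int __init example_init(void)
{
	/* kern_mount(t) == vfs_kern_mount(t, MS_KERNMOUNT, t->name, NULL) */
	example_mnt = kern_mount(&example_fs_type);
	if (IS_ERR(example_mnt))
		return PTR_ERR(example_mnt);
	return 0;
}
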
924 924
925 static struct mount *clone_mnt(struct mount *old, struct dentry *root, 925 static struct mount *clone_mnt(struct mount *old, struct dentry *root,
926 int flag) 926 int flag)
927 { 927 {
928 struct super_block *sb = old->mnt.mnt_sb; 928 struct super_block *sb = old->mnt.mnt_sb;
929 struct mount *mnt; 929 struct mount *mnt;
930 int err; 930 int err;
931 931
932 mnt = alloc_vfsmnt(old->mnt_devname); 932 mnt = alloc_vfsmnt(old->mnt_devname);
933 if (!mnt) 933 if (!mnt)
934 return ERR_PTR(-ENOMEM); 934 return ERR_PTR(-ENOMEM);
935 935
936 if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE)) 936 if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
937 mnt->mnt_group_id = 0; /* not a peer of original */ 937 mnt->mnt_group_id = 0; /* not a peer of original */
938 else 938 else
939 mnt->mnt_group_id = old->mnt_group_id; 939 mnt->mnt_group_id = old->mnt_group_id;
940 940
941 if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { 941 if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
942 err = mnt_alloc_group_id(mnt); 942 err = mnt_alloc_group_id(mnt);
943 if (err) 943 if (err)
944 goto out_free; 944 goto out_free;
945 } 945 }
946 946
947 mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); 947 mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED);
948 /* Don't allow unprivileged users to change mount flags */ 948 /* Don't allow unprivileged users to change mount flags */
949 if (flag & CL_UNPRIVILEGED) { 949 if (flag & CL_UNPRIVILEGED) {
950 mnt->mnt.mnt_flags |= MNT_LOCK_ATIME; 950 mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;
951 951
952 if (mnt->mnt.mnt_flags & MNT_READONLY) 952 if (mnt->mnt.mnt_flags & MNT_READONLY)
953 mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; 953 mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
954 954
955 if (mnt->mnt.mnt_flags & MNT_NODEV) 955 if (mnt->mnt.mnt_flags & MNT_NODEV)
956 mnt->mnt.mnt_flags |= MNT_LOCK_NODEV; 956 mnt->mnt.mnt_flags |= MNT_LOCK_NODEV;
957 957
958 if (mnt->mnt.mnt_flags & MNT_NOSUID) 958 if (mnt->mnt.mnt_flags & MNT_NOSUID)
959 mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID; 959 mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID;
960 960
961 if (mnt->mnt.mnt_flags & MNT_NOEXEC) 961 if (mnt->mnt.mnt_flags & MNT_NOEXEC)
962 mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC; 962 mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC;
963 } 963 }
964 964
965 /* Don't allow unprivileged users to reveal what is under a mount */ 965 /* Don't allow unprivileged users to reveal what is under a mount */
966 if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire)) 966 if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire))
967 mnt->mnt.mnt_flags |= MNT_LOCKED; 967 mnt->mnt.mnt_flags |= MNT_LOCKED;
968 968
969 atomic_inc(&sb->s_active); 969 atomic_inc(&sb->s_active);
970 mnt->mnt.mnt_sb = sb; 970 mnt->mnt.mnt_sb = sb;
971 mnt->mnt.mnt_root = dget(root); 971 mnt->mnt.mnt_root = dget(root);
972 mnt->mnt_mountpoint = mnt->mnt.mnt_root; 972 mnt->mnt_mountpoint = mnt->mnt.mnt_root;
973 mnt->mnt_parent = mnt; 973 mnt->mnt_parent = mnt;
974 lock_mount_hash(); 974 lock_mount_hash();
975 list_add_tail(&mnt->mnt_instance, &sb->s_mounts); 975 list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
976 unlock_mount_hash(); 976 unlock_mount_hash();
977 977
978 if ((flag & CL_SLAVE) || 978 if ((flag & CL_SLAVE) ||
979 ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) { 979 ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
980 list_add(&mnt->mnt_slave, &old->mnt_slave_list); 980 list_add(&mnt->mnt_slave, &old->mnt_slave_list);
981 mnt->mnt_master = old; 981 mnt->mnt_master = old;
982 CLEAR_MNT_SHARED(mnt); 982 CLEAR_MNT_SHARED(mnt);
983 } else if (!(flag & CL_PRIVATE)) { 983 } else if (!(flag & CL_PRIVATE)) {
984 if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) 984 if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
985 list_add(&mnt->mnt_share, &old->mnt_share); 985 list_add(&mnt->mnt_share, &old->mnt_share);
986 if (IS_MNT_SLAVE(old)) 986 if (IS_MNT_SLAVE(old))
987 list_add(&mnt->mnt_slave, &old->mnt_slave); 987 list_add(&mnt->mnt_slave, &old->mnt_slave);
988 mnt->mnt_master = old->mnt_master; 988 mnt->mnt_master = old->mnt_master;
989 } 989 }
990 if (flag & CL_MAKE_SHARED) 990 if (flag & CL_MAKE_SHARED)
991 set_mnt_shared(mnt); 991 set_mnt_shared(mnt);
992 992
993 /* stick the duplicate mount on the same expiry list 993 /* stick the duplicate mount on the same expiry list
994 * as the original if that was on one */ 994 * as the original if that was on one */
995 if (flag & CL_EXPIRE) { 995 if (flag & CL_EXPIRE) {
996 if (!list_empty(&old->mnt_expire)) 996 if (!list_empty(&old->mnt_expire))
997 list_add(&mnt->mnt_expire, &old->mnt_expire); 997 list_add(&mnt->mnt_expire, &old->mnt_expire);
998 } 998 }
999 999
1000 return mnt; 1000 return mnt;
1001 1001
1002 out_free: 1002 out_free:
1003 mnt_free_id(mnt); 1003 mnt_free_id(mnt);
1004 free_vfsmnt(mnt); 1004 free_vfsmnt(mnt);
1005 return ERR_PTR(err); 1005 return ERR_PTR(err);
1006 } 1006 }
1007 1007
1008 static void cleanup_mnt(struct mount *mnt) 1008 static void cleanup_mnt(struct mount *mnt)
1009 { 1009 {
1010 /* 1010 /*
1011 * This probably indicates that somebody messed 1011 * This probably indicates that somebody messed
1012 * up a mnt_want/drop_write() pair. If this 1012 * up a mnt_want/drop_write() pair. If this
1013 * happens, the filesystem was probably unable 1013 * happens, the filesystem was probably unable
1014 * to make r/w->r/o transitions. 1014 * to make r/w->r/o transitions.
1015 */ 1015 */
1016 /* 1016 /*
1017 * The locking used to deal with mnt_count decrement provides barriers, 1017 * The locking used to deal with mnt_count decrement provides barriers,
1018 * so mnt_get_writers() below is safe. 1018 * so mnt_get_writers() below is safe.
1019 */ 1019 */
1020 WARN_ON(mnt_get_writers(mnt)); 1020 WARN_ON(mnt_get_writers(mnt));
1021 if (unlikely(mnt->mnt_pins.first)) 1021 if (unlikely(mnt->mnt_pins.first))
1022 mnt_pin_kill(mnt); 1022 mnt_pin_kill(mnt);
1023 fsnotify_vfsmount_delete(&mnt->mnt); 1023 fsnotify_vfsmount_delete(&mnt->mnt);
1024 dput(mnt->mnt.mnt_root); 1024 dput(mnt->mnt.mnt_root);
1025 deactivate_super(mnt->mnt.mnt_sb); 1025 deactivate_super(mnt->mnt.mnt_sb);
1026 mnt_free_id(mnt); 1026 mnt_free_id(mnt);
1027 call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); 1027 call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
1028 } 1028 }
1029 1029
1030 static void __cleanup_mnt(struct rcu_head *head) 1030 static void __cleanup_mnt(struct rcu_head *head)
1031 { 1031 {
1032 cleanup_mnt(container_of(head, struct mount, mnt_rcu)); 1032 cleanup_mnt(container_of(head, struct mount, mnt_rcu));
1033 } 1033 }
1034 1034
1035 static LLIST_HEAD(delayed_mntput_list); 1035 static LLIST_HEAD(delayed_mntput_list);
1036 static void delayed_mntput(struct work_struct *unused) 1036 static void delayed_mntput(struct work_struct *unused)
1037 { 1037 {
1038 struct llist_node *node = llist_del_all(&delayed_mntput_list); 1038 struct llist_node *node = llist_del_all(&delayed_mntput_list);
1039 struct llist_node *next; 1039 struct llist_node *next;
1040 1040
1041 for (; node; node = next) { 1041 for (; node; node = next) {
1042 next = llist_next(node); 1042 next = llist_next(node);
1043 cleanup_mnt(llist_entry(node, struct mount, mnt_llist)); 1043 cleanup_mnt(llist_entry(node, struct mount, mnt_llist));
1044 } 1044 }
1045 } 1045 }
1046 static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); 1046 static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
1047 1047
1048 static void mntput_no_expire(struct mount *mnt) 1048 static void mntput_no_expire(struct mount *mnt)
1049 { 1049 {
1050 rcu_read_lock(); 1050 rcu_read_lock();
1051 mnt_add_count(mnt, -1); 1051 mnt_add_count(mnt, -1);
1052 if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */ 1052 if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
1053 rcu_read_unlock(); 1053 rcu_read_unlock();
1054 return; 1054 return;
1055 } 1055 }
1056 lock_mount_hash(); 1056 lock_mount_hash();
1057 if (mnt_get_count(mnt)) { 1057 if (mnt_get_count(mnt)) {
1058 rcu_read_unlock(); 1058 rcu_read_unlock();
1059 unlock_mount_hash(); 1059 unlock_mount_hash();
1060 return; 1060 return;
1061 } 1061 }
1062 if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { 1062 if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
1063 rcu_read_unlock(); 1063 rcu_read_unlock();
1064 unlock_mount_hash(); 1064 unlock_mount_hash();
1065 return; 1065 return;
1066 } 1066 }
1067 mnt->mnt.mnt_flags |= MNT_DOOMED; 1067 mnt->mnt.mnt_flags |= MNT_DOOMED;
1068 rcu_read_unlock(); 1068 rcu_read_unlock();
1069 1069
1070 list_del(&mnt->mnt_instance); 1070 list_del(&mnt->mnt_instance);
1071 unlock_mount_hash(); 1071 unlock_mount_hash();
1072 1072
1073 if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) { 1073 if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
1074 struct task_struct *task = current; 1074 struct task_struct *task = current;
1075 if (likely(!(task->flags & PF_KTHREAD))) { 1075 if (likely(!(task->flags & PF_KTHREAD))) {
1076 init_task_work(&mnt->mnt_rcu, __cleanup_mnt); 1076 init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
1077 if (!task_work_add(task, &mnt->mnt_rcu, true)) 1077 if (!task_work_add(task, &mnt->mnt_rcu, true))
1078 return; 1078 return;
1079 } 1079 }
1080 if (llist_add(&mnt->mnt_llist, &delayed_mntput_list)) 1080 if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
1081 schedule_delayed_work(&delayed_mntput_work, 1); 1081 schedule_delayed_work(&delayed_mntput_work, 1);
1082 return; 1082 return;
1083 } 1083 }
1084 cleanup_mnt(mnt); 1084 cleanup_mnt(mnt);
1085 } 1085 }
1086 1086
1087 void mntput(struct vfsmount *mnt) 1087 void mntput(struct vfsmount *mnt)
1088 { 1088 {
1089 if (mnt) { 1089 if (mnt) {
1090 struct mount *m = real_mount(mnt); 1090 struct mount *m = real_mount(mnt);
1091 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ 1091 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
1092 if (unlikely(m->mnt_expiry_mark)) 1092 if (unlikely(m->mnt_expiry_mark))
1093 m->mnt_expiry_mark = 0; 1093 m->mnt_expiry_mark = 0;
1094 mntput_no_expire(m); 1094 mntput_no_expire(m);
1095 } 1095 }
1096 } 1096 }
1097 EXPORT_SYMBOL(mntput); 1097 EXPORT_SYMBOL(mntput);
1098 1098
1099 struct vfsmount *mntget(struct vfsmount *mnt) 1099 struct vfsmount *mntget(struct vfsmount *mnt)
1100 { 1100 {
1101 if (mnt) 1101 if (mnt)
1102 mnt_add_count(real_mount(mnt), 1); 1102 mnt_add_count(real_mount(mnt), 1);
1103 return mnt; 1103 return mnt;
1104 } 1104 }
1105 EXPORT_SYMBOL(mntget); 1105 EXPORT_SYMBOL(mntget);
1106 1106
1107 struct vfsmount *mnt_clone_internal(struct path *path) 1107 struct vfsmount *mnt_clone_internal(struct path *path)
1108 { 1108 {
1109 struct mount *p; 1109 struct mount *p;
1110 p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE); 1110 p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
1111 if (IS_ERR(p)) 1111 if (IS_ERR(p))
1112 return ERR_CAST(p); 1112 return ERR_CAST(p);
1113 p->mnt.mnt_flags |= MNT_INTERNAL; 1113 p->mnt.mnt_flags |= MNT_INTERNAL;
1114 return &p->mnt; 1114 return &p->mnt;
1115 } 1115 }
1116 1116
1117 static inline void mangle(struct seq_file *m, const char *s) 1117 static inline void mangle(struct seq_file *m, const char *s)
1118 { 1118 {
1119 seq_escape(m, s, " \t\n\\"); 1119 seq_escape(m, s, " \t\n\\");
1120 } 1120 }
1121 1121
1122 /* 1122 /*
1123 * Simple .show_options callback for filesystems which don't want to 1123 * Simple .show_options callback for filesystems which don't want to
1124 * implement more complex mount option showing. 1124 * implement more complex mount option showing.
1125 * 1125 *
1126 * See also save_mount_options(). 1126 * See also save_mount_options().
1127 */ 1127 */
1128 int generic_show_options(struct seq_file *m, struct dentry *root) 1128 int generic_show_options(struct seq_file *m, struct dentry *root)
1129 { 1129 {
1130 const char *options; 1130 const char *options;
1131 1131
1132 rcu_read_lock(); 1132 rcu_read_lock();
1133 options = rcu_dereference(root->d_sb->s_options); 1133 options = rcu_dereference(root->d_sb->s_options);
1134 1134
1135 if (options != NULL && options[0]) { 1135 if (options != NULL && options[0]) {
1136 seq_putc(m, ','); 1136 seq_putc(m, ',');
1137 mangle(m, options); 1137 mangle(m, options);
1138 } 1138 }
1139 rcu_read_unlock(); 1139 rcu_read_unlock();
1140 1140
1141 return 0; 1141 return 0;
1142 } 1142 }
1143 EXPORT_SYMBOL(generic_show_options); 1143 EXPORT_SYMBOL(generic_show_options);
1144 1144
1145 /* 1145 /*
1146 * If filesystem uses generic_show_options(), this function should be 1146 * If filesystem uses generic_show_options(), this function should be
1147 * called from the fill_super() callback. 1147 * called from the fill_super() callback.
1148 * 1148 *
1149 * The .remount_fs callback usually needs to be handled in a special 1149 * The .remount_fs callback usually needs to be handled in a special
1150 * way, to make sure, that previous options are not overwritten if the 1150 * way, to make sure, that previous options are not overwritten if the
1151 * remount fails. 1151 * remount fails.
1152 * 1152 *
1153 * Also note, that if the filesystem's .remount_fs function doesn't 1153 * Also note, that if the filesystem's .remount_fs function doesn't
1154 * reset all options to their default value, but changes only newly 1154 * reset all options to their default value, but changes only newly
1155 * given options, then the displayed options will not reflect reality 1155 * given options, then the displayed options will not reflect reality
1156 * any more. 1156 * any more.
1157 */ 1157 */
1158 void save_mount_options(struct super_block *sb, char *options) 1158 void save_mount_options(struct super_block *sb, char *options)
1159 { 1159 {
1160 BUG_ON(sb->s_options); 1160 BUG_ON(sb->s_options);
1161 rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL)); 1161 rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL));
1162 } 1162 }
1163 EXPORT_SYMBOL(save_mount_options); 1163 EXPORT_SYMBOL(save_mount_options);
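
A minimal sketch of the pairing the two comments above describe (example_fill_super() and example_super_ops are hypothetical): the filesystem saves the raw option string at mount time and lets generic_show_options() replay it.

static const struct super_operations example_super_ops = {
	.show_options	= generic_show_options,
};

static int example_fill_super(struct super_block *sb, void *data, int silent)
{
	save_mount_options(sb, data);	/* data is the raw mount option string */
	sb->s_op = &example_super_ops;
	/* ... allocate the root inode, set sb fields, etc ... */
	return 0;
}
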
1164 1164
1165 void replace_mount_options(struct super_block *sb, char *options) 1165 void replace_mount_options(struct super_block *sb, char *options)
1166 { 1166 {
1167 char *old = sb->s_options; 1167 char *old = sb->s_options;
1168 rcu_assign_pointer(sb->s_options, options); 1168 rcu_assign_pointer(sb->s_options, options);
1169 if (old) { 1169 if (old) {
1170 synchronize_rcu(); 1170 synchronize_rcu();
1171 kfree(old); 1171 kfree(old);
1172 } 1172 }
1173 } 1173 }
1174 EXPORT_SYMBOL(replace_mount_options); 1174 EXPORT_SYMBOL(replace_mount_options);
1175 1175
1176 #ifdef CONFIG_PROC_FS 1176 #ifdef CONFIG_PROC_FS
1177 /* iterator; we want it to have access to namespace_sem, thus here... */ 1177 /* iterator; we want it to have access to namespace_sem, thus here... */
1178 static void *m_start(struct seq_file *m, loff_t *pos) 1178 static void *m_start(struct seq_file *m, loff_t *pos)
1179 { 1179 {
1180 struct proc_mounts *p = proc_mounts(m); 1180 struct proc_mounts *p = proc_mounts(m);
1181 1181
1182 down_read(&namespace_sem); 1182 down_read(&namespace_sem);
1183 if (p->cached_event == p->ns->event) { 1183 if (p->cached_event == p->ns->event) {
1184 void *v = p->cached_mount; 1184 void *v = p->cached_mount;
1185 if (*pos == p->cached_index) 1185 if (*pos == p->cached_index)
1186 return v; 1186 return v;
1187 if (*pos == p->cached_index + 1) { 1187 if (*pos == p->cached_index + 1) {
1188 v = seq_list_next(v, &p->ns->list, &p->cached_index); 1188 v = seq_list_next(v, &p->ns->list, &p->cached_index);
1189 return p->cached_mount = v; 1189 return p->cached_mount = v;
1190 } 1190 }
1191 } 1191 }
1192 1192
1193 p->cached_event = p->ns->event; 1193 p->cached_event = p->ns->event;
1194 p->cached_mount = seq_list_start(&p->ns->list, *pos); 1194 p->cached_mount = seq_list_start(&p->ns->list, *pos);
1195 p->cached_index = *pos; 1195 p->cached_index = *pos;
1196 return p->cached_mount; 1196 return p->cached_mount;
1197 } 1197 }
1198 1198
1199 static void *m_next(struct seq_file *m, void *v, loff_t *pos) 1199 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
1200 { 1200 {
1201 struct proc_mounts *p = proc_mounts(m); 1201 struct proc_mounts *p = proc_mounts(m);
1202 1202
1203 p->cached_mount = seq_list_next(v, &p->ns->list, pos); 1203 p->cached_mount = seq_list_next(v, &p->ns->list, pos);
1204 p->cached_index = *pos; 1204 p->cached_index = *pos;
1205 return p->cached_mount; 1205 return p->cached_mount;
1206 } 1206 }
1207 1207
1208 static void m_stop(struct seq_file *m, void *v) 1208 static void m_stop(struct seq_file *m, void *v)
1209 { 1209 {
1210 up_read(&namespace_sem); 1210 up_read(&namespace_sem);
1211 } 1211 }
1212 1212
1213 static int m_show(struct seq_file *m, void *v) 1213 static int m_show(struct seq_file *m, void *v)
1214 { 1214 {
1215 struct proc_mounts *p = proc_mounts(m); 1215 struct proc_mounts *p = proc_mounts(m);
1216 struct mount *r = list_entry(v, struct mount, mnt_list); 1216 struct mount *r = list_entry(v, struct mount, mnt_list);
1217 return p->show(m, &r->mnt); 1217 return p->show(m, &r->mnt);
1218 } 1218 }
1219 1219
1220 const struct seq_operations mounts_op = { 1220 const struct seq_operations mounts_op = {
1221 .start = m_start, 1221 .start = m_start,
1222 .next = m_next, 1222 .next = m_next,
1223 .stop = m_stop, 1223 .stop = m_stop,
1224 .show = m_show, 1224 .show = m_show,
1225 }; 1225 };
1226 #endif /* CONFIG_PROC_FS */ 1226 #endif /* CONFIG_PROC_FS */
1227 1227
1228 /** 1228 /**
1229 * may_umount_tree - check if a mount tree is busy 1229 * may_umount_tree - check if a mount tree is busy
1230 * @m: root of mount tree 1230 * @m: root of mount tree
1231 * 1231 *
1232 * This is called to check if a tree of mounts has any 1232 * This is called to check if a tree of mounts has any
1233 * open files, pwds, chroots or sub mounts that are 1233 * open files, pwds, chroots or sub mounts that are
1234 * busy. 1234 * busy.
1235 */ 1235 */
1236 int may_umount_tree(struct vfsmount *m) 1236 int may_umount_tree(struct vfsmount *m)
1237 { 1237 {
1238 struct mount *mnt = real_mount(m); 1238 struct mount *mnt = real_mount(m);
1239 int actual_refs = 0; 1239 int actual_refs = 0;
1240 int minimum_refs = 0; 1240 int minimum_refs = 0;
1241 struct mount *p; 1241 struct mount *p;
1242 BUG_ON(!m); 1242 BUG_ON(!m);
1243 1243
1244 /* write lock needed for mnt_get_count */ 1244 /* write lock needed for mnt_get_count */
1245 lock_mount_hash(); 1245 lock_mount_hash();
1246 for (p = mnt; p; p = next_mnt(p, mnt)) { 1246 for (p = mnt; p; p = next_mnt(p, mnt)) {
1247 actual_refs += mnt_get_count(p); 1247 actual_refs += mnt_get_count(p);
1248 minimum_refs += 2; 1248 minimum_refs += 2;
1249 } 1249 }
1250 unlock_mount_hash(); 1250 unlock_mount_hash();
1251 1251
1252 if (actual_refs > minimum_refs) 1252 if (actual_refs > minimum_refs)
1253 return 0; 1253 return 0;
1254 1254
1255 return 1; 1255 return 1;
1256 } 1256 }
1257 1257
1258 EXPORT_SYMBOL(may_umount_tree); 1258 EXPORT_SYMBOL(may_umount_tree);
1259 1259
1260 /** 1260 /**
1261 * may_umount - check if a mount point is busy 1261 * may_umount - check if a mount point is busy
1262 * @mnt: root of mount 1262 * @mnt: root of mount
1263 * 1263 *
1264 * This is called to check if a mount point has any 1264 * This is called to check if a mount point has any
1265 * open files, pwds, chroots or sub mounts. If the 1265 * open files, pwds, chroots or sub mounts. If the
1266 * mount has sub mounts this will return busy 1266 * mount has sub mounts this will return busy
1267 * regardless of whether the sub mounts are busy. 1267 * regardless of whether the sub mounts are busy.
1268 * 1268 *
1269 * Doesn't take quota and stuff into account. IOW, in some cases it will 1269 * Doesn't take quota and stuff into account. IOW, in some cases it will
1270 * give false negatives. The main reason why it's here is that we need 1270 * give false negatives. The main reason why it's here is that we need
1271 * a non-destructive way to look for easily umountable filesystems. 1271 * a non-destructive way to look for easily umountable filesystems.
1272 */ 1272 */
1273 int may_umount(struct vfsmount *mnt) 1273 int may_umount(struct vfsmount *mnt)
1274 { 1274 {
1275 int ret = 1; 1275 int ret = 1;
1276 down_read(&namespace_sem); 1276 down_read(&namespace_sem);
1277 lock_mount_hash(); 1277 lock_mount_hash();
1278 if (propagate_mount_busy(real_mount(mnt), 2)) 1278 if (propagate_mount_busy(real_mount(mnt), 2))
1279 ret = 0; 1279 ret = 0;
1280 unlock_mount_hash(); 1280 unlock_mount_hash();
1281 up_read(&namespace_sem); 1281 up_read(&namespace_sem);
1282 return ret; 1282 return ret;
1283 } 1283 }
1284 1284
1285 EXPORT_SYMBOL(may_umount); 1285 EXPORT_SYMBOL(may_umount);
1286 1286
1287 static HLIST_HEAD(unmounted); /* protected by namespace_sem */ 1287 static HLIST_HEAD(unmounted); /* protected by namespace_sem */
1288 1288
1289 static void namespace_unlock(void) 1289 static void namespace_unlock(void)
1290 { 1290 {
1291 struct mount *mnt; 1291 struct mount *mnt;
1292 struct hlist_head head = unmounted; 1292 struct hlist_head head = unmounted;
1293 1293
1294 if (likely(hlist_empty(&head))) { 1294 if (likely(hlist_empty(&head))) {
1295 up_write(&namespace_sem); 1295 up_write(&namespace_sem);
1296 return; 1296 return;
1297 } 1297 }
1298 1298
1299 head.first->pprev = &head.first; 1299 head.first->pprev = &head.first;
1300 INIT_HLIST_HEAD(&unmounted); 1300 INIT_HLIST_HEAD(&unmounted);
1301 1301
1302 /* undo decrements we'd done in umount_tree() */ 1302 /* undo decrements we'd done in umount_tree() */
1303 hlist_for_each_entry(mnt, &head, mnt_hash) 1303 hlist_for_each_entry(mnt, &head, mnt_hash)
1304 if (mnt->mnt_ex_mountpoint.mnt) 1304 if (mnt->mnt_ex_mountpoint.mnt)
1305 mntget(mnt->mnt_ex_mountpoint.mnt); 1305 mntget(mnt->mnt_ex_mountpoint.mnt);
1306 1306
1307 up_write(&namespace_sem); 1307 up_write(&namespace_sem);
1308 1308
1309 synchronize_rcu(); 1309 synchronize_rcu();
1310 1310
1311 while (!hlist_empty(&head)) { 1311 while (!hlist_empty(&head)) {
1312 mnt = hlist_entry(head.first, struct mount, mnt_hash); 1312 mnt = hlist_entry(head.first, struct mount, mnt_hash);
1313 hlist_del_init(&mnt->mnt_hash); 1313 hlist_del_init(&mnt->mnt_hash);
1314 if (mnt->mnt_ex_mountpoint.mnt) 1314 if (mnt->mnt_ex_mountpoint.mnt)
1315 path_put(&mnt->mnt_ex_mountpoint); 1315 path_put(&mnt->mnt_ex_mountpoint);
1316 mntput(&mnt->mnt); 1316 mntput(&mnt->mnt);
1317 } 1317 }
1318 } 1318 }
1319 1319
1320 static inline void namespace_lock(void) 1320 static inline void namespace_lock(void)
1321 { 1321 {
1322 down_write(&namespace_sem); 1322 down_write(&namespace_sem);
1323 } 1323 }
1324 1324
1325 /* 1325 /*
1326 * mount_lock must be held 1326 * mount_lock must be held
1327 * namespace_sem must be held for write 1327 * namespace_sem must be held for write
1328 * how = 0 => just this tree, don't propagate 1328 * how = 0 => just this tree, don't propagate
1329 * how = 1 => propagate; we know that nobody else has reference to any victims 1329 * how = 1 => propagate; we know that nobody else has reference to any victims
1330 * how = 2 => lazy umount 1330 * how = 2 => lazy umount
1331 */ 1331 */
1332 void umount_tree(struct mount *mnt, int how) 1332 void umount_tree(struct mount *mnt, int how)
1333 { 1333 {
1334 HLIST_HEAD(tmp_list); 1334 HLIST_HEAD(tmp_list);
1335 struct mount *p; 1335 struct mount *p;
1336 struct mount *last = NULL; 1336 struct mount *last = NULL;
1337 1337
1338 for (p = mnt; p; p = next_mnt(p, mnt)) { 1338 for (p = mnt; p; p = next_mnt(p, mnt)) {
1339 hlist_del_init_rcu(&p->mnt_hash); 1339 hlist_del_init_rcu(&p->mnt_hash);
1340 hlist_add_head(&p->mnt_hash, &tmp_list); 1340 hlist_add_head(&p->mnt_hash, &tmp_list);
1341 } 1341 }
1342 1342
1343 hlist_for_each_entry(p, &tmp_list, mnt_hash) 1343 hlist_for_each_entry(p, &tmp_list, mnt_hash)
1344 list_del_init(&p->mnt_child); 1344 list_del_init(&p->mnt_child);
1345 1345
1346 if (how) 1346 if (how)
1347 propagate_umount(&tmp_list); 1347 propagate_umount(&tmp_list);
1348 1348
1349 hlist_for_each_entry(p, &tmp_list, mnt_hash) { 1349 hlist_for_each_entry(p, &tmp_list, mnt_hash) {
1350 list_del_init(&p->mnt_expire); 1350 list_del_init(&p->mnt_expire);
1351 list_del_init(&p->mnt_list); 1351 list_del_init(&p->mnt_list);
1352 __touch_mnt_namespace(p->mnt_ns); 1352 __touch_mnt_namespace(p->mnt_ns);
1353 p->mnt_ns = NULL; 1353 p->mnt_ns = NULL;
1354 if (how < 2) 1354 if (how < 2)
1355 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; 1355 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
1356 if (mnt_has_parent(p)) { 1356 if (mnt_has_parent(p)) {
1357 hlist_del_init(&p->mnt_mp_list); 1357 hlist_del_init(&p->mnt_mp_list);
1358 put_mountpoint(p->mnt_mp); 1358 put_mountpoint(p->mnt_mp);
1359 mnt_add_count(p->mnt_parent, -1); 1359 mnt_add_count(p->mnt_parent, -1);
1360 /* move the reference to mountpoint into ->mnt_ex_mountpoint */ 1360 /* move the reference to mountpoint into ->mnt_ex_mountpoint */
1361 p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint; 1361 p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint;
1362 p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt; 1362 p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt;
1363 p->mnt_mountpoint = p->mnt.mnt_root; 1363 p->mnt_mountpoint = p->mnt.mnt_root;
1364 p->mnt_parent = p; 1364 p->mnt_parent = p;
1365 p->mnt_mp = NULL; 1365 p->mnt_mp = NULL;
1366 } 1366 }
1367 change_mnt_propagation(p, MS_PRIVATE); 1367 change_mnt_propagation(p, MS_PRIVATE);
1368 last = p; 1368 last = p;
1369 } 1369 }
1370 if (last) { 1370 if (last) {
1371 last->mnt_hash.next = unmounted.first; 1371 last->mnt_hash.next = unmounted.first;
1372 unmounted.first = tmp_list.first; 1372 unmounted.first = tmp_list.first;
1373 unmounted.first->pprev = &unmounted.first; 1373 unmounted.first->pprev = &unmounted.first;
1374 } 1374 }
1375 } 1375 }
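
For orientation, the callers visible in this file pick "how" as follows (a summary of the code below, not new behaviour):

/*
 * how = 0: teardown paths that never published the tree
 *          (copy_tree() failure, drop_collected_mounts())
 * how = 1: regular umount in do_umount() - propagation, with the
 *          guarantee that nobody else holds references to the victims
 * how = 2: lazy detach - umount(MNT_DETACH) and __detach_mounts()
 */
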
1376 1376
1377 static void shrink_submounts(struct mount *mnt); 1377 static void shrink_submounts(struct mount *mnt);
1378 1378
1379 static int do_umount(struct mount *mnt, int flags) 1379 static int do_umount(struct mount *mnt, int flags)
1380 { 1380 {
1381 struct super_block *sb = mnt->mnt.mnt_sb; 1381 struct super_block *sb = mnt->mnt.mnt_sb;
1382 int retval; 1382 int retval;
1383 1383
1384 retval = security_sb_umount(&mnt->mnt, flags); 1384 retval = security_sb_umount(&mnt->mnt, flags);
1385 if (retval) 1385 if (retval)
1386 return retval; 1386 return retval;
1387 1387
1388 /* 1388 /*
1389 * Allow userspace to request a mountpoint be expired rather than 1389 * Allow userspace to request a mountpoint be expired rather than
1390 * unmounting unconditionally. Unmount only happens if: 1390 * unmounting unconditionally. Unmount only happens if:
1391 * (1) the mark is already set (the mark is cleared by mntput()) 1391 * (1) the mark is already set (the mark is cleared by mntput())
1392 * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount] 1392 * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
1393 */ 1393 */
1394 if (flags & MNT_EXPIRE) { 1394 if (flags & MNT_EXPIRE) {
1395 if (&mnt->mnt == current->fs->root.mnt || 1395 if (&mnt->mnt == current->fs->root.mnt ||
1396 flags & (MNT_FORCE | MNT_DETACH)) 1396 flags & (MNT_FORCE | MNT_DETACH))
1397 return -EINVAL; 1397 return -EINVAL;
1398 1398
1399 /* 1399 /*
1400 * probably don't strictly need the lock here if we examined 1400 * probably don't strictly need the lock here if we examined
1401 * all race cases, but it's a slowpath. 1401 * all race cases, but it's a slowpath.
1402 */ 1402 */
1403 lock_mount_hash(); 1403 lock_mount_hash();
1404 if (mnt_get_count(mnt) != 2) { 1404 if (mnt_get_count(mnt) != 2) {
1405 unlock_mount_hash(); 1405 unlock_mount_hash();
1406 return -EBUSY; 1406 return -EBUSY;
1407 } 1407 }
1408 unlock_mount_hash(); 1408 unlock_mount_hash();
1409 1409
1410 if (!xchg(&mnt->mnt_expiry_mark, 1)) 1410 if (!xchg(&mnt->mnt_expiry_mark, 1))
1411 return -EAGAIN; 1411 return -EAGAIN;
1412 } 1412 }
1413 1413
1414 /* 1414 /*
1415 * If we may have to abort operations to get out of this 1415 * If we may have to abort operations to get out of this
1416 * mount, and they will themselves hold resources we must 1416 * mount, and they will themselves hold resources we must
1417 * allow the fs to do things. In the Unix tradition of 1417 * allow the fs to do things. In the Unix tradition of
1418 * 'Gee that's tricky, let's do it in userspace' the umount_begin 1418 * 'Gee that's tricky, let's do it in userspace' the umount_begin
1419 * might fail to complete on the first run through as other tasks 1419 * might fail to complete on the first run through as other tasks
1420 * must return, and the like. That's for the mount program to worry 1420 * must return, and the like. That's for the mount program to worry
1421 * about for the moment. 1421 * about for the moment.
1422 */ 1422 */
1423 1423
1424 if (flags & MNT_FORCE && sb->s_op->umount_begin) { 1424 if (flags & MNT_FORCE && sb->s_op->umount_begin) {
1425 sb->s_op->umount_begin(sb); 1425 sb->s_op->umount_begin(sb);
1426 } 1426 }
1427 1427
1428 /* 1428 /*
1429 * No sense to grab the lock for this test, but test itself looks 1429 * No sense to grab the lock for this test, but test itself looks
1430 * somewhat bogus. Suggestions for better replacement? 1430 * somewhat bogus. Suggestions for better replacement?
1431 * Ho-hum... In principle, we might treat that as umount + switch 1431 * Ho-hum... In principle, we might treat that as umount + switch
1432 * to rootfs. GC would eventually take care of the old vfsmount. 1432 * to rootfs. GC would eventually take care of the old vfsmount.
1433 * Actually it makes sense, especially if rootfs would contain a 1433 * Actually it makes sense, especially if rootfs would contain a
1434 * /reboot - static binary that would close all descriptors and 1434 * /reboot - static binary that would close all descriptors and
1435 * call reboot(2). Then init(8) could umount root and exec /reboot. 1435 * call reboot(2). Then init(8) could umount root and exec /reboot.
1436 */ 1436 */
1437 if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { 1437 if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
1438 /* 1438 /*
1439 * Special case for "unmounting" root ... 1439 * Special case for "unmounting" root ...
1440 * we just try to remount it readonly. 1440 * we just try to remount it readonly.
1441 */ 1441 */
1442 if (!capable(CAP_SYS_ADMIN)) 1442 if (!capable(CAP_SYS_ADMIN))
1443 return -EPERM; 1443 return -EPERM;
1444 down_write(&sb->s_umount); 1444 down_write(&sb->s_umount);
1445 if (!(sb->s_flags & MS_RDONLY)) 1445 if (!(sb->s_flags & MS_RDONLY))
1446 retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); 1446 retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
1447 up_write(&sb->s_umount); 1447 up_write(&sb->s_umount);
1448 return retval; 1448 return retval;
1449 } 1449 }
1450 1450
1451 namespace_lock(); 1451 namespace_lock();
1452 lock_mount_hash(); 1452 lock_mount_hash();
1453 event++; 1453 event++;
1454 1454
1455 if (flags & MNT_DETACH) { 1455 if (flags & MNT_DETACH) {
1456 if (!list_empty(&mnt->mnt_list)) 1456 if (!list_empty(&mnt->mnt_list))
1457 umount_tree(mnt, 2); 1457 umount_tree(mnt, 2);
1458 retval = 0; 1458 retval = 0;
1459 } else { 1459 } else {
1460 shrink_submounts(mnt); 1460 shrink_submounts(mnt);
1461 retval = -EBUSY; 1461 retval = -EBUSY;
1462 if (!propagate_mount_busy(mnt, 2)) { 1462 if (!propagate_mount_busy(mnt, 2)) {
1463 if (!list_empty(&mnt->mnt_list)) 1463 if (!list_empty(&mnt->mnt_list))
1464 umount_tree(mnt, 1); 1464 umount_tree(mnt, 1);
1465 retval = 0; 1465 retval = 0;
1466 } 1466 }
1467 } 1467 }
1468 unlock_mount_hash(); 1468 unlock_mount_hash();
1469 namespace_unlock(); 1469 namespace_unlock();
1470 return retval; 1470 return retval;
1471 } 1471 }
1472 1472
1473 /* 1473 /*
1474 * __detach_mounts - lazily unmount all mounts on the specified dentry 1474 * __detach_mounts - lazily unmount all mounts on the specified dentry
1475 * 1475 *
1476 * During unlink, rmdir, and d_drop it is possible to lose the path 1476 * During unlink, rmdir, and d_drop it is possible to lose the path
1477 * to an existing mountpoint, and wind up leaking the mount. 1477 * to an existing mountpoint, and wind up leaking the mount.
1478 * detach_mounts allows lazily unmounting those mounts instead of 1478 * detach_mounts allows lazily unmounting those mounts instead of
1479 * leaking them. 1479 * leaking them.
1480 * 1480 *
1481 * The caller may hold dentry->d_inode->i_mutex. 1481 * The caller may hold dentry->d_inode->i_mutex.
1482 */ 1482 */
1483 void __detach_mounts(struct dentry *dentry) 1483 void __detach_mounts(struct dentry *dentry)
1484 { 1484 {
1485 struct mountpoint *mp; 1485 struct mountpoint *mp;
1486 struct mount *mnt; 1486 struct mount *mnt;
1487 1487
1488 namespace_lock(); 1488 namespace_lock();
1489 mp = lookup_mountpoint(dentry); 1489 mp = lookup_mountpoint(dentry);
1490 if (!mp) 1490 if (!mp)
1491 goto out_unlock; 1491 goto out_unlock;
1492 1492
1493 lock_mount_hash(); 1493 lock_mount_hash();
1494 while (!hlist_empty(&mp->m_list)) { 1494 while (!hlist_empty(&mp->m_list)) {
1495 mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); 1495 mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
1496 umount_tree(mnt, 2); 1496 umount_tree(mnt, 2);
1497 } 1497 }
1498 unlock_mount_hash(); 1498 unlock_mount_hash();
1499 put_mountpoint(mp); 1499 put_mountpoint(mp);
1500 out_unlock: 1500 out_unlock:
1501 namespace_unlock(); 1501 namespace_unlock();
1502 } 1502 }
1503 1503
1504 /* 1504 /*
1505 * Is the caller allowed to modify his namespace? 1505 * Is the caller allowed to modify his namespace?
1506 */ 1506 */
1507 static inline bool may_mount(void) 1507 static inline bool may_mount(void)
1508 { 1508 {
1509 return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); 1509 return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
1510 } 1510 }
1511 1511
1512 /* 1512 /*
1513 * Now umount can handle mount points as well as block devices. 1513 * Now umount can handle mount points as well as block devices.
1514 * This is important for filesystems which use unnamed block devices. 1514 * This is important for filesystems which use unnamed block devices.
1515 * 1515 *
1516 * We now support a flag for forced unmount like the other 'big iron' 1516 * We now support a flag for forced unmount like the other 'big iron'
1517 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD 1517 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
1518 */ 1518 */
1519 1519
1520 SYSCALL_DEFINE2(umount, char __user *, name, int, flags) 1520 SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
1521 { 1521 {
1522 struct path path; 1522 struct path path;
1523 struct mount *mnt; 1523 struct mount *mnt;
1524 int retval; 1524 int retval;
1525 int lookup_flags = 0; 1525 int lookup_flags = 0;
1526 1526
1527 if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) 1527 if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
1528 return -EINVAL; 1528 return -EINVAL;
1529 1529
1530 if (!may_mount()) 1530 if (!may_mount())
1531 return -EPERM; 1531 return -EPERM;
1532 1532
1533 if (!(flags & UMOUNT_NOFOLLOW)) 1533 if (!(flags & UMOUNT_NOFOLLOW))
1534 lookup_flags |= LOOKUP_FOLLOW; 1534 lookup_flags |= LOOKUP_FOLLOW;
1535 1535
1536 retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path); 1536 retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
1537 if (retval) 1537 if (retval)
1538 goto out; 1538 goto out;
1539 mnt = real_mount(path.mnt); 1539 mnt = real_mount(path.mnt);
1540 retval = -EINVAL; 1540 retval = -EINVAL;
1541 if (path.dentry != path.mnt->mnt_root) 1541 if (path.dentry != path.mnt->mnt_root)
1542 goto dput_and_out; 1542 goto dput_and_out;
1543 if (!check_mnt(mnt)) 1543 if (!check_mnt(mnt))
1544 goto dput_and_out; 1544 goto dput_and_out;
1545 if (mnt->mnt.mnt_flags & MNT_LOCKED) 1545 if (mnt->mnt.mnt_flags & MNT_LOCKED)
1546 goto dput_and_out; 1546 goto dput_and_out;
1547 1547
1548 retval = do_umount(mnt, flags); 1548 retval = do_umount(mnt, flags);
1549 dput_and_out: 1549 dput_and_out:
1550 /* we mustn't call path_put() as that would clear mnt_expiry_mark */ 1550 /* we mustn't call path_put() as that would clear mnt_expiry_mark */
1551 dput(path.dentry); 1551 dput(path.dentry);
1552 mntput_no_expire(mnt); 1552 mntput_no_expire(mnt);
1553 out: 1553 out:
1554 return retval; 1554 return retval;
1555 } 1555 }
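
From userspace, the MNT_EXPIRE dance handled by do_umount() above is a two-call pattern; a sketch per umount(2), with a hypothetical expire_mount() helper:

#include <sys/mount.h>
#include <errno.h>
#include <unistd.h>

/* Returns 0 once the mount has expired and been unmounted. */
static int expire_mount(const char *target, unsigned int grace)
{
	if (umount2(target, MNT_EXPIRE) == 0)
		return 0;			/* mark was set earlier; gone now */
	if (errno != EAGAIN)
		return -1;			/* EBUSY etc.: still in use */
	sleep(grace);				/* first call set the expiry mark */
	return umount2(target, MNT_EXPIRE);	/* 0 if untouched meanwhile */
}
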
1556 1556
1557 #ifdef __ARCH_WANT_SYS_OLDUMOUNT 1557 #ifdef __ARCH_WANT_SYS_OLDUMOUNT
1558 1558
1559 /* 1559 /*
1560 * The 2.0 compatible umount. No flags. 1560 * The 2.0 compatible umount. No flags.
1561 */ 1561 */
1562 SYSCALL_DEFINE1(oldumount, char __user *, name) 1562 SYSCALL_DEFINE1(oldumount, char __user *, name)
1563 { 1563 {
1564 return sys_umount(name, 0); 1564 return sys_umount(name, 0);
1565 } 1565 }
1566 1566
1567 #endif 1567 #endif
1568 1568
1569 static bool is_mnt_ns_file(struct dentry *dentry) 1569 static bool is_mnt_ns_file(struct dentry *dentry)
1570 { 1570 {
1571 /* Is this a proxy for a mount namespace? */ 1571 /* Is this a proxy for a mount namespace? */
1572 struct inode *inode = dentry->d_inode; 1572 return dentry->d_op == &ns_dentry_operations &&
1573 return proc_ns_inode(inode) && dentry->d_fsdata == &mntns_operations; 1573 dentry->d_fsdata == &mntns_operations;
1574 } 1574 }
1575 1575
1576 struct mnt_namespace *to_mnt_ns(struct ns_common *ns) 1576 struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
1577 { 1577 {
1578 return container_of(ns, struct mnt_namespace, ns); 1578 return container_of(ns, struct mnt_namespace, ns);
1579 } 1579 }
1580 1580
1581 static bool mnt_ns_loop(struct dentry *dentry) 1581 static bool mnt_ns_loop(struct dentry *dentry)
1582 { 1582 {
1583 /* Could bind mounting the mount namespace inode cause a 1583 /* Could bind mounting the mount namespace inode cause a
1584 * mount namespace loop? 1584 * mount namespace loop?
1585 */ 1585 */
1586 struct mnt_namespace *mnt_ns; 1586 struct mnt_namespace *mnt_ns;
1587 if (!is_mnt_ns_file(dentry)) 1587 if (!is_mnt_ns_file(dentry))
1588 return false; 1588 return false;
1589 1589
1590 mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode)); 1590 mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode));
1591 return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; 1591 return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
1592 } 1592 }
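
The seq comparison encodes "you may only bind namespace files of namespaces created after your own", which is what rules out reference cycles:

/*
 * Example: a task in mount namespace A (seq 5) may bind-mount
 * /proc/N/ns/mnt only if it refers to a namespace with seq > 5,
 * i.e. one created strictly after A.  "Created after" is a strict
 * order, so no set of namespaces can keep each other pinned alive
 * through such bind mounts.
 */
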
1593 1593
1594 struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, 1594 struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1595 int flag) 1595 int flag)
1596 { 1596 {
1597 struct mount *res, *p, *q, *r, *parent; 1597 struct mount *res, *p, *q, *r, *parent;
1598 1598
1599 if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt)) 1599 if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
1600 return ERR_PTR(-EINVAL); 1600 return ERR_PTR(-EINVAL);
1601 1601
1602 if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry)) 1602 if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
1603 return ERR_PTR(-EINVAL); 1603 return ERR_PTR(-EINVAL);
1604 1604
1605 res = q = clone_mnt(mnt, dentry, flag); 1605 res = q = clone_mnt(mnt, dentry, flag);
1606 if (IS_ERR(q)) 1606 if (IS_ERR(q))
1607 return q; 1607 return q;
1608 1608
1609 q->mnt.mnt_flags &= ~MNT_LOCKED; 1609 q->mnt.mnt_flags &= ~MNT_LOCKED;
1610 q->mnt_mountpoint = mnt->mnt_mountpoint; 1610 q->mnt_mountpoint = mnt->mnt_mountpoint;
1611 1611
1612 p = mnt; 1612 p = mnt;
1613 list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) { 1613 list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
1614 struct mount *s; 1614 struct mount *s;
1615 if (!is_subdir(r->mnt_mountpoint, dentry)) 1615 if (!is_subdir(r->mnt_mountpoint, dentry))
1616 continue; 1616 continue;
1617 1617
1618 for (s = r; s; s = next_mnt(s, r)) { 1618 for (s = r; s; s = next_mnt(s, r)) {
1619 struct mount *t = NULL; 1619 struct mount *t = NULL;
1620 if (!(flag & CL_COPY_UNBINDABLE) && 1620 if (!(flag & CL_COPY_UNBINDABLE) &&
1621 IS_MNT_UNBINDABLE(s)) { 1621 IS_MNT_UNBINDABLE(s)) {
1622 s = skip_mnt_tree(s); 1622 s = skip_mnt_tree(s);
1623 continue; 1623 continue;
1624 } 1624 }
1625 if (!(flag & CL_COPY_MNT_NS_FILE) && 1625 if (!(flag & CL_COPY_MNT_NS_FILE) &&
1626 is_mnt_ns_file(s->mnt.mnt_root)) { 1626 is_mnt_ns_file(s->mnt.mnt_root)) {
1627 s = skip_mnt_tree(s); 1627 s = skip_mnt_tree(s);
1628 continue; 1628 continue;
1629 } 1629 }
1630 while (p != s->mnt_parent) { 1630 while (p != s->mnt_parent) {
1631 p = p->mnt_parent; 1631 p = p->mnt_parent;
1632 q = q->mnt_parent; 1632 q = q->mnt_parent;
1633 } 1633 }
1634 p = s; 1634 p = s;
1635 parent = q; 1635 parent = q;
1636 q = clone_mnt(p, p->mnt.mnt_root, flag); 1636 q = clone_mnt(p, p->mnt.mnt_root, flag);
1637 if (IS_ERR(q)) 1637 if (IS_ERR(q))
1638 goto out; 1638 goto out;
1639 lock_mount_hash(); 1639 lock_mount_hash();
1640 list_add_tail(&q->mnt_list, &res->mnt_list); 1640 list_add_tail(&q->mnt_list, &res->mnt_list);
1641 mnt_set_mountpoint(parent, p->mnt_mp, q); 1641 mnt_set_mountpoint(parent, p->mnt_mp, q);
1642 if (!list_empty(&parent->mnt_mounts)) { 1642 if (!list_empty(&parent->mnt_mounts)) {
1643 t = list_last_entry(&parent->mnt_mounts, 1643 t = list_last_entry(&parent->mnt_mounts,
1644 struct mount, mnt_child); 1644 struct mount, mnt_child);
1645 if (t->mnt_mp != p->mnt_mp) 1645 if (t->mnt_mp != p->mnt_mp)
1646 t = NULL; 1646 t = NULL;
1647 } 1647 }
1648 attach_shadowed(q, parent, t); 1648 attach_shadowed(q, parent, t);
1649 unlock_mount_hash(); 1649 unlock_mount_hash();
1650 } 1650 }
1651 } 1651 }
1652 return res; 1652 return res;
1653 out: 1653 out:
1654 if (res) { 1654 if (res) {
1655 lock_mount_hash(); 1655 lock_mount_hash();
1656 umount_tree(res, 0); 1656 umount_tree(res, 0);
1657 unlock_mount_hash(); 1657 unlock_mount_hash();
1658 } 1658 }
1659 return q; 1659 return q;
1660 } 1660 }
1661 1661
1662 /* Caller should check returned pointer for errors */ 1662 /* Caller should check returned pointer for errors */
1663 1663
1664 struct vfsmount *collect_mounts(struct path *path) 1664 struct vfsmount *collect_mounts(struct path *path)
1665 { 1665 {
1666 struct mount *tree; 1666 struct mount *tree;
1667 namespace_lock(); 1667 namespace_lock();
1668 tree = copy_tree(real_mount(path->mnt), path->dentry, 1668 tree = copy_tree(real_mount(path->mnt), path->dentry,
1669 CL_COPY_ALL | CL_PRIVATE); 1669 CL_COPY_ALL | CL_PRIVATE);
1670 namespace_unlock(); 1670 namespace_unlock();
1671 if (IS_ERR(tree)) 1671 if (IS_ERR(tree))
1672 return ERR_CAST(tree); 1672 return ERR_CAST(tree);
1673 return &tree->mnt; 1673 return &tree->mnt;
1674 } 1674 }
1675 1675
1676 void drop_collected_mounts(struct vfsmount *mnt) 1676 void drop_collected_mounts(struct vfsmount *mnt)
1677 { 1677 {
1678 namespace_lock(); 1678 namespace_lock();
1679 lock_mount_hash(); 1679 lock_mount_hash();
1680 umount_tree(real_mount(mnt), 0); 1680 umount_tree(real_mount(mnt), 0);
1681 unlock_mount_hash(); 1681 unlock_mount_hash();
1682 namespace_unlock(); 1682 namespace_unlock();
1683 } 1683 }
1684 1684
1685 /** 1685 /**
1686 * clone_private_mount - create a private clone of a path 1686 * clone_private_mount - create a private clone of a path
1687 * 1687 *
1688 * This creates a new vfsmount, which will be the clone of @path. The new 1688 * This creates a new vfsmount, which will be the clone of @path. The new
1689 * mount will not be attached anywhere in the namespace and will be private 1689 * mount will not be attached anywhere in the namespace and will be private
1690 * (i.e. changes to the originating mount won't be propagated into it). 1690 * (i.e. changes to the originating mount won't be propagated into it).
1691 * 1691 *
1692 * Release with mntput(). 1692 * Release with mntput().
1693 */ 1693 */
1694 struct vfsmount *clone_private_mount(struct path *path) 1694 struct vfsmount *clone_private_mount(struct path *path)
1695 { 1695 {
1696 struct mount *old_mnt = real_mount(path->mnt); 1696 struct mount *old_mnt = real_mount(path->mnt);
1697 struct mount *new_mnt; 1697 struct mount *new_mnt;
1698 1698
1699 if (IS_MNT_UNBINDABLE(old_mnt)) 1699 if (IS_MNT_UNBINDABLE(old_mnt))
1700 return ERR_PTR(-EINVAL); 1700 return ERR_PTR(-EINVAL);
1701 1701
1702 down_read(&namespace_sem); 1702 down_read(&namespace_sem);
1703 new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); 1703 new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
1704 up_read(&namespace_sem); 1704 up_read(&namespace_sem);
1705 if (IS_ERR(new_mnt)) 1705 if (IS_ERR(new_mnt))
1706 return ERR_CAST(new_mnt); 1706 return ERR_CAST(new_mnt);
1707 1707
1708 return &new_mnt->mnt; 1708 return &new_mnt->mnt;
1709 } 1709 }
1710 EXPORT_SYMBOL_GPL(clone_private_mount); 1710 EXPORT_SYMBOL_GPL(clone_private_mount);
1711 1711
1712 int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, 1712 int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
1713 struct vfsmount *root) 1713 struct vfsmount *root)
1714 { 1714 {
1715 struct mount *mnt; 1715 struct mount *mnt;
1716 int res = f(root, arg); 1716 int res = f(root, arg);
1717 if (res) 1717 if (res)
1718 return res; 1718 return res;
1719 list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) { 1719 list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
1720 res = f(&mnt->mnt, arg); 1720 res = f(&mnt->mnt, arg);
1721 if (res) 1721 if (res)
1722 return res; 1722 return res;
1723 } 1723 }
1724 return 0; 1724 return 0;
1725 } 1725 }
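
collect_mounts(), iterate_mounts() and drop_collected_mounts() work as a unit: snapshot a subtree privately, visit every mount in the snapshot, then dispose of it (the pattern the audit subsystem relies on). A hedged sketch; all example_* names are made up:

	static int example_count_one(struct vfsmount *mnt, void *arg)
	{
		(*(int *)arg)++;
		return 0;		/* non-zero stops the walk early */
	}

	static int example_count_tree(struct path *path)
	{
		struct vfsmount *snapshot = collect_mounts(path);
		int n = 0;

		if (IS_ERR(snapshot))
			return PTR_ERR(snapshot);
		iterate_mounts(example_count_one, &n, snapshot);
		drop_collected_mounts(snapshot);
		return n;
	}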
1726 1726
1727 static void cleanup_group_ids(struct mount *mnt, struct mount *end) 1727 static void cleanup_group_ids(struct mount *mnt, struct mount *end)
1728 { 1728 {
1729 struct mount *p; 1729 struct mount *p;
1730 1730
1731 for (p = mnt; p != end; p = next_mnt(p, mnt)) { 1731 for (p = mnt; p != end; p = next_mnt(p, mnt)) {
1732 if (p->mnt_group_id && !IS_MNT_SHARED(p)) 1732 if (p->mnt_group_id && !IS_MNT_SHARED(p))
1733 mnt_release_group_id(p); 1733 mnt_release_group_id(p);
1734 } 1734 }
1735 } 1735 }
1736 1736
1737 static int invent_group_ids(struct mount *mnt, bool recurse) 1737 static int invent_group_ids(struct mount *mnt, bool recurse)
1738 { 1738 {
1739 struct mount *p; 1739 struct mount *p;
1740 1740
1741 for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { 1741 for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
1742 if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { 1742 if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
1743 int err = mnt_alloc_group_id(p); 1743 int err = mnt_alloc_group_id(p);
1744 if (err) { 1744 if (err) {
1745 cleanup_group_ids(mnt, p); 1745 cleanup_group_ids(mnt, p);
1746 return err; 1746 return err;
1747 } 1747 }
1748 } 1748 }
1749 } 1749 }
1750 1750
1751 return 0; 1751 return 0;
1752 } 1752 }
1753 1753
1754 /* 1754 /*
1755 * @source_mnt : mount tree to be attached 1755 * @source_mnt : mount tree to be attached
1756 * @nd : place where the mount tree @source_mnt is attached 1756 * @nd : place where the mount tree @source_mnt is attached
1757 * @parent_nd : if non-null, detach the source_mnt from its parent and 1757 * @parent_nd : if non-null, detach the source_mnt from its parent and
1758 * store the parent mount and mountpoint dentry. 1758 * store the parent mount and mountpoint dentry.
1759 * (done when source_mnt is moved) 1759 * (done when source_mnt is moved)
1760 * 1760 *
1761 * NOTE: the table below explains the semantics when a source mount 1761 * NOTE: the table below explains the semantics when a source mount
1762 * of a given type is attached to a destination mount of a given type. 1762 * of a given type is attached to a destination mount of a given type.
1763 * --------------------------------------------------------------------------- 1763 * ---------------------------------------------------------------------------
1764 * | BIND MOUNT OPERATION | 1764 * | BIND MOUNT OPERATION |
1765 * |************************************************************************** 1765 * |**************************************************************************
1766 * | source-->| shared | private | slave | unbindable | 1766 * | source-->| shared | private | slave | unbindable |
1767 * | dest | | | | | 1767 * | dest | | | | |
1768 * | | | | | | | 1768 * | | | | | | |
1769 * | v | | | | | 1769 * | v | | | | |
1770 * |************************************************************************** 1770 * |**************************************************************************
1771 * | shared | shared (++) | shared (+) | shared(+++)| invalid | 1771 * | shared | shared (++) | shared (+) | shared(+++)| invalid |
1772 * | | | | | | 1772 * | | | | | |
1773 * |non-shared| shared (+) | private | slave (*) | invalid | 1773 * |non-shared| shared (+) | private | slave (*) | invalid |
1774 * *************************************************************************** 1774 * ***************************************************************************
1775 * A bind operation clones the source mount and mounts the clone on the 1775 * A bind operation clones the source mount and mounts the clone on the
1776 * destination mount. 1776 * destination mount.
1777 * 1777 *
1778 * (++) the cloned mount is propagated to all the mounts in the propagation 1778 * (++) the cloned mount is propagated to all the mounts in the propagation
1779 * tree of the destination mount and the cloned mount is added to 1779 * tree of the destination mount and the cloned mount is added to
1780 * the peer group of the source mount. 1780 * the peer group of the source mount.
1781 * (+) the cloned mount is created under the destination mount and is marked 1781 * (+) the cloned mount is created under the destination mount and is marked
1782 * as shared. The cloned mount is added to the peer group of the source 1782 * as shared. The cloned mount is added to the peer group of the source
1783 * mount. 1783 * mount.
1784 * (+++) the mount is propagated to all the mounts in the propagation tree 1784 * (+++) the mount is propagated to all the mounts in the propagation tree
1785 * of the destination mount and the cloned mount is made slave 1785 * of the destination mount and the cloned mount is made slave
1786 * of the same master as that of the source mount. The cloned mount 1786 * of the same master as that of the source mount. The cloned mount
1787 * is marked as 'shared and slave'. 1787 * is marked as 'shared and slave'.
1788 * (*) the cloned mount is made a slave of the same master as that of the 1788 * (*) the cloned mount is made a slave of the same master as that of the
1789 * source mount. 1789 * source mount.
1790 * 1790 *
1791 * --------------------------------------------------------------------------- 1791 * ---------------------------------------------------------------------------
1792 * | MOVE MOUNT OPERATION | 1792 * | MOVE MOUNT OPERATION |
1793 * |************************************************************************** 1793 * |**************************************************************************
1794 * | source-->| shared | private | slave | unbindable | 1794 * | source-->| shared | private | slave | unbindable |
1795 * | dest | | | | | 1795 * | dest | | | | |
1796 * | | | | | | | 1796 * | | | | | | |
1797 * | v | | | | | 1797 * | v | | | | |
1798 * |************************************************************************** 1798 * |**************************************************************************
1799 * | shared | shared (+) | shared (+) | shared(+++) | invalid | 1799 * | shared | shared (+) | shared (+) | shared(+++) | invalid |
1800 * | | | | | | 1800 * | | | | | |
1801 * |non-shared| shared (+*) | private | slave (*) | unbindable | 1801 * |non-shared| shared (+*) | private | slave (*) | unbindable |
1802 * *************************************************************************** 1802 * ***************************************************************************
1803 * 1803 *
1804 * (+) the mount is moved to the destination. And is then propagated to 1804 * (+) the mount is moved to the destination. And is then propagated to
1805 * all the mounts in the propagation tree of the destination mount. 1805 * all the mounts in the propagation tree of the destination mount.
1806 * (+*) the mount is moved to the destination. 1806 * (+*) the mount is moved to the destination.
1807 * (+++) the mount is moved to the destination and is then propagated to 1807 * (+++) the mount is moved to the destination and is then propagated to
1808 * all the mounts belonging to the destination mount's propagation tree. 1808 * all the mounts belonging to the destination mount's propagation tree.
1809 * the mount is marked as 'shared and slave'. 1809 * the mount is marked as 'shared and slave'.
1810 * (*) the mount continues to be a slave at the new location. 1810 * (*) the mount continues to be a slave at the new location.
1811 * 1811 *
1812 * if the source mount is a tree, the operations explained above are 1812 * if the source mount is a tree, the operations explained above are
1813 * applied to each mount in the tree. 1813 * applied to each mount in the tree.
1814 * Must be called without spinlocks held, since this function can sleep 1814 * Must be called without spinlocks held, since this function can sleep
1815 * in allocations. 1815 * in allocations.
1816 */ 1816 */
1817 static int attach_recursive_mnt(struct mount *source_mnt, 1817 static int attach_recursive_mnt(struct mount *source_mnt,
1818 struct mount *dest_mnt, 1818 struct mount *dest_mnt,
1819 struct mountpoint *dest_mp, 1819 struct mountpoint *dest_mp,
1820 struct path *parent_path) 1820 struct path *parent_path)
1821 { 1821 {
1822 HLIST_HEAD(tree_list); 1822 HLIST_HEAD(tree_list);
1823 struct mount *child, *p; 1823 struct mount *child, *p;
1824 struct hlist_node *n; 1824 struct hlist_node *n;
1825 int err; 1825 int err;
1826 1826
1827 if (IS_MNT_SHARED(dest_mnt)) { 1827 if (IS_MNT_SHARED(dest_mnt)) {
1828 err = invent_group_ids(source_mnt, true); 1828 err = invent_group_ids(source_mnt, true);
1829 if (err) 1829 if (err)
1830 goto out; 1830 goto out;
1831 err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); 1831 err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
1832 lock_mount_hash(); 1832 lock_mount_hash();
1833 if (err) 1833 if (err)
1834 goto out_cleanup_ids; 1834 goto out_cleanup_ids;
1835 for (p = source_mnt; p; p = next_mnt(p, source_mnt)) 1835 for (p = source_mnt; p; p = next_mnt(p, source_mnt))
1836 set_mnt_shared(p); 1836 set_mnt_shared(p);
1837 } else { 1837 } else {
1838 lock_mount_hash(); 1838 lock_mount_hash();
1839 } 1839 }
1840 if (parent_path) { 1840 if (parent_path) {
1841 detach_mnt(source_mnt, parent_path); 1841 detach_mnt(source_mnt, parent_path);
1842 attach_mnt(source_mnt, dest_mnt, dest_mp); 1842 attach_mnt(source_mnt, dest_mnt, dest_mp);
1843 touch_mnt_namespace(source_mnt->mnt_ns); 1843 touch_mnt_namespace(source_mnt->mnt_ns);
1844 } else { 1844 } else {
1845 mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); 1845 mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
1846 commit_tree(source_mnt, NULL); 1846 commit_tree(source_mnt, NULL);
1847 } 1847 }
1848 1848
1849 hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { 1849 hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
1850 struct mount *q; 1850 struct mount *q;
1851 hlist_del_init(&child->mnt_hash); 1851 hlist_del_init(&child->mnt_hash);
1852 q = __lookup_mnt_last(&child->mnt_parent->mnt, 1852 q = __lookup_mnt_last(&child->mnt_parent->mnt,
1853 child->mnt_mountpoint); 1853 child->mnt_mountpoint);
1854 commit_tree(child, q); 1854 commit_tree(child, q);
1855 } 1855 }
1856 unlock_mount_hash(); 1856 unlock_mount_hash();
1857 1857
1858 return 0; 1858 return 0;
1859 1859
1860 out_cleanup_ids: 1860 out_cleanup_ids:
1861 while (!hlist_empty(&tree_list)) { 1861 while (!hlist_empty(&tree_list)) {
1862 child = hlist_entry(tree_list.first, struct mount, mnt_hash); 1862 child = hlist_entry(tree_list.first, struct mount, mnt_hash);
1863 umount_tree(child, 0); 1863 umount_tree(child, 0);
1864 } 1864 }
1865 unlock_mount_hash(); 1865 unlock_mount_hash();
1866 cleanup_group_ids(source_mnt, NULL); 1866 cleanup_group_ids(source_mnt, NULL);
1867 out: 1867 out:
1868 return err; 1868 return err;
1869 } 1869 }
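
The shared/private cells of the bind table are easiest to see from userspace with two bind mounts and one propagation change. A hedged sketch with mount(2), assuming /A is already the root of a mount and that /A/dir, /B and /src exist:

	mount(NULL, "/A", NULL, MS_SHARED, NULL);	/* dest-to-be becomes shared */
	mount("/A", "/B", NULL, MS_BIND, NULL);		/* non-shared dest, shared
							 * source: clone is shared,
							 * joins /A's peer group (+) */
	mount("/src", "/A/dir", NULL, MS_BIND, NULL);	/* shared dest: the clone is
							 * propagated to the peer, so
							 * it appears at /B/dir too */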
1870 1870
1871 static struct mountpoint *lock_mount(struct path *path) 1871 static struct mountpoint *lock_mount(struct path *path)
1872 { 1872 {
1873 struct vfsmount *mnt; 1873 struct vfsmount *mnt;
1874 struct dentry *dentry = path->dentry; 1874 struct dentry *dentry = path->dentry;
1875 retry: 1875 retry:
1876 mutex_lock(&dentry->d_inode->i_mutex); 1876 mutex_lock(&dentry->d_inode->i_mutex);
1877 if (unlikely(cant_mount(dentry))) { 1877 if (unlikely(cant_mount(dentry))) {
1878 mutex_unlock(&dentry->d_inode->i_mutex); 1878 mutex_unlock(&dentry->d_inode->i_mutex);
1879 return ERR_PTR(-ENOENT); 1879 return ERR_PTR(-ENOENT);
1880 } 1880 }
1881 namespace_lock(); 1881 namespace_lock();
1882 mnt = lookup_mnt(path); 1882 mnt = lookup_mnt(path);
1883 if (likely(!mnt)) { 1883 if (likely(!mnt)) {
1884 struct mountpoint *mp = lookup_mountpoint(dentry); 1884 struct mountpoint *mp = lookup_mountpoint(dentry);
1885 if (!mp) 1885 if (!mp)
1886 mp = new_mountpoint(dentry); 1886 mp = new_mountpoint(dentry);
1887 if (IS_ERR(mp)) { 1887 if (IS_ERR(mp)) {
1888 namespace_unlock(); 1888 namespace_unlock();
1889 mutex_unlock(&dentry->d_inode->i_mutex); 1889 mutex_unlock(&dentry->d_inode->i_mutex);
1890 return mp; 1890 return mp;
1891 } 1891 }
1892 return mp; 1892 return mp;
1893 } 1893 }
1894 namespace_unlock(); 1894 namespace_unlock();
1895 mutex_unlock(&path->dentry->d_inode->i_mutex); 1895 mutex_unlock(&path->dentry->d_inode->i_mutex);
1896 path_put(path); 1896 path_put(path);
1897 path->mnt = mnt; 1897 path->mnt = mnt;
1898 dentry = path->dentry = dget(mnt->mnt_root); 1898 dentry = path->dentry = dget(mnt->mnt_root);
1899 goto retry; 1899 goto retry;
1900 } 1900 }
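
Spelled out, the retry loop gives lock_mount() this behaviour (a reading of the code above, not new semantics):

	/*
	 * lock_mount(path):
	 *   1. take i_mutex on the candidate mountpoint
	 *   2. if a mount already sits on that dentry, drop the locks,
	 *      retarget path to the root of that covering mount, retry
	 *   3. otherwise find or create the struct mountpoint and return
	 *      it with namespace_sem and i_mutex still held
	 * so a caller always stacks its new mount on top of whatever
	 * raced in first, never underneath it.
	 */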
1901 1901
1902 static void unlock_mount(struct mountpoint *where) 1902 static void unlock_mount(struct mountpoint *where)
1903 { 1903 {
1904 struct dentry *dentry = where->m_dentry; 1904 struct dentry *dentry = where->m_dentry;
1905 put_mountpoint(where); 1905 put_mountpoint(where);
1906 namespace_unlock(); 1906 namespace_unlock();
1907 mutex_unlock(&dentry->d_inode->i_mutex); 1907 mutex_unlock(&dentry->d_inode->i_mutex);
1908 } 1908 }
1909 1909
1910 static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) 1910 static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
1911 { 1911 {
1912 if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER) 1912 if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER)
1913 return -EINVAL; 1913 return -EINVAL;
1914 1914
1915 if (S_ISDIR(mp->m_dentry->d_inode->i_mode) != 1915 if (S_ISDIR(mp->m_dentry->d_inode->i_mode) !=
1916 S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode)) 1916 S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode))
1917 return -ENOTDIR; 1917 return -ENOTDIR;
1918 1918
1919 return attach_recursive_mnt(mnt, p, mp, NULL); 1919 return attach_recursive_mnt(mnt, p, mp, NULL);
1920 } 1920 }
1921 1921
1922 /* 1922 /*
1923 * Sanity check the flags to change_mnt_propagation. 1923 * Sanity check the flags to change_mnt_propagation.
1924 */ 1924 */
1925 1925
1926 static int flags_to_propagation_type(int flags) 1926 static int flags_to_propagation_type(int flags)
1927 { 1927 {
1928 int type = flags & ~(MS_REC | MS_SILENT); 1928 int type = flags & ~(MS_REC | MS_SILENT);
1929 1929
1930 /* Fail if any non-propagation flags are set */ 1930 /* Fail if any non-propagation flags are set */
1931 if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) 1931 if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
1932 return 0; 1932 return 0;
1933 /* Only one propagation flag should be set */ 1933 /* Only one propagation flag should be set */
1934 if (!is_power_of_2(type)) 1934 if (!is_power_of_2(type))
1935 return 0; 1935 return 0;
1936 return type; 1936 return type;
1937 } 1937 }
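
Concretely, for a few hypothetical inputs (the results follow directly from the two checks above):

	flags_to_propagation_type(MS_SHARED | MS_REC);		/* MS_SHARED	  */
	flags_to_propagation_type(MS_SHARED | MS_PRIVATE);	/* 0: two types set */
	flags_to_propagation_type(MS_SLAVE | MS_RDONLY);	/* 0: stray
								 * non-propagation
								 * flag present   */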
1938 1938
1939 /* 1939 /*
1940 * recursively change the type of the mountpoint. 1940 * recursively change the type of the mountpoint.
1941 */ 1941 */
1942 static int do_change_type(struct path *path, int flag) 1942 static int do_change_type(struct path *path, int flag)
1943 { 1943 {
1944 struct mount *m; 1944 struct mount *m;
1945 struct mount *mnt = real_mount(path->mnt); 1945 struct mount *mnt = real_mount(path->mnt);
1946 int recurse = flag & MS_REC; 1946 int recurse = flag & MS_REC;
1947 int type; 1947 int type;
1948 int err = 0; 1948 int err = 0;
1949 1949
1950 if (path->dentry != path->mnt->mnt_root) 1950 if (path->dentry != path->mnt->mnt_root)
1951 return -EINVAL; 1951 return -EINVAL;
1952 1952
1953 type = flags_to_propagation_type(flag); 1953 type = flags_to_propagation_type(flag);
1954 if (!type) 1954 if (!type)
1955 return -EINVAL; 1955 return -EINVAL;
1956 1956
1957 namespace_lock(); 1957 namespace_lock();
1958 if (type == MS_SHARED) { 1958 if (type == MS_SHARED) {
1959 err = invent_group_ids(mnt, recurse); 1959 err = invent_group_ids(mnt, recurse);
1960 if (err) 1960 if (err)
1961 goto out_unlock; 1961 goto out_unlock;
1962 } 1962 }
1963 1963
1964 lock_mount_hash(); 1964 lock_mount_hash();
1965 for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) 1965 for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
1966 change_mnt_propagation(m, type); 1966 change_mnt_propagation(m, type);
1967 unlock_mount_hash(); 1967 unlock_mount_hash();
1968 1968
1969 out_unlock: 1969 out_unlock:
1970 namespace_unlock(); 1970 namespace_unlock();
1971 return err; 1971 return err;
1972 } 1972 }
1973 1973
1974 static bool has_locked_children(struct mount *mnt, struct dentry *dentry) 1974 static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
1975 { 1975 {
1976 struct mount *child; 1976 struct mount *child;
1977 list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { 1977 list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
1978 if (!is_subdir(child->mnt_mountpoint, dentry)) 1978 if (!is_subdir(child->mnt_mountpoint, dentry))
1979 continue; 1979 continue;
1980 1980
1981 if (child->mnt.mnt_flags & MNT_LOCKED) 1981 if (child->mnt.mnt_flags & MNT_LOCKED)
1982 return true; 1982 return true;
1983 } 1983 }
1984 return false; 1984 return false;
1985 } 1985 }
1986 1986
1987 /* 1987 /*
1988 * do loopback mount. 1988 * do loopback mount.
1989 */ 1989 */
1990 static int do_loopback(struct path *path, const char *old_name, 1990 static int do_loopback(struct path *path, const char *old_name,
1991 int recurse) 1991 int recurse)
1992 { 1992 {
1993 struct path old_path; 1993 struct path old_path;
1994 struct mount *mnt = NULL, *old, *parent; 1994 struct mount *mnt = NULL, *old, *parent;
1995 struct mountpoint *mp; 1995 struct mountpoint *mp;
1996 int err; 1996 int err;
1997 if (!old_name || !*old_name) 1997 if (!old_name || !*old_name)
1998 return -EINVAL; 1998 return -EINVAL;
1999 err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); 1999 err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
2000 if (err) 2000 if (err)
2001 return err; 2001 return err;
2002 2002
2003 err = -EINVAL; 2003 err = -EINVAL;
2004 if (mnt_ns_loop(old_path.dentry)) 2004 if (mnt_ns_loop(old_path.dentry))
2005 goto out; 2005 goto out;
2006 2006
2007 mp = lock_mount(path); 2007 mp = lock_mount(path);
2008 err = PTR_ERR(mp); 2008 err = PTR_ERR(mp);
2009 if (IS_ERR(mp)) 2009 if (IS_ERR(mp))
2010 goto out; 2010 goto out;
2011 2011
2012 old = real_mount(old_path.mnt); 2012 old = real_mount(old_path.mnt);
2013 parent = real_mount(path->mnt); 2013 parent = real_mount(path->mnt);
2014 2014
2015 err = -EINVAL; 2015 err = -EINVAL;
2016 if (IS_MNT_UNBINDABLE(old)) 2016 if (IS_MNT_UNBINDABLE(old))
2017 goto out2; 2017 goto out2;
2018 2018
2019 if (!check_mnt(parent) || !check_mnt(old)) 2019 if (!check_mnt(parent))
2020 goto out2;
2021
2022 if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations)
2020 goto out2; 2023 goto out2;
2021 2024
2022 if (!recurse && has_locked_children(old, old_path.dentry)) 2025 if (!recurse && has_locked_children(old, old_path.dentry))
2023 goto out2; 2026 goto out2;
2024 2027
2025 if (recurse) 2028 if (recurse)
2026 mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE); 2029 mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
2027 else 2030 else
2028 mnt = clone_mnt(old, old_path.dentry, 0); 2031 mnt = clone_mnt(old, old_path.dentry, 0);
2029 2032
2030 if (IS_ERR(mnt)) { 2033 if (IS_ERR(mnt)) {
2031 err = PTR_ERR(mnt); 2034 err = PTR_ERR(mnt);
2032 goto out2; 2035 goto out2;
2033 } 2036 }
2034 2037
2035 mnt->mnt.mnt_flags &= ~MNT_LOCKED; 2038 mnt->mnt.mnt_flags &= ~MNT_LOCKED;
2036 2039
2037 err = graft_tree(mnt, parent, mp); 2040 err = graft_tree(mnt, parent, mp);
2038 if (err) { 2041 if (err) {
2039 lock_mount_hash(); 2042 lock_mount_hash();
2040 umount_tree(mnt, 0); 2043 umount_tree(mnt, 0);
2041 unlock_mount_hash(); 2044 unlock_mount_hash();
2042 } 2045 }
2043 out2: 2046 out2:
2044 unlock_mount(mp); 2047 unlock_mount(mp);
2045 out: 2048 out:
2046 path_put(&old_path); 2049 path_put(&old_path);
2047 return err; 2050 return err;
2048 } 2051 }
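
The relaxed check above is where this commit's "files on nsfs are bindable" promise is kept: an nsfs dentry fails check_mnt() (the internal nsfs mount belongs to no mount namespace) but is recognized by its d_op and allowed through. From userspace this enables the usual namespace keep-alive trick; a hedged sketch, assuming /run/netns exists:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mount.h>
	#include <unistd.h>

	/* pin the current net namespace so it outlives this process */
	close(open("/run/netns/keep", O_CREAT | O_WRONLY, 0600));
	if (mount("/proc/self/ns/net", "/run/netns/keep",
		  NULL, MS_BIND, NULL) < 0)
		perror("bind of nsfs file");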
2049 2052
2050 static int change_mount_flags(struct vfsmount *mnt, int ms_flags) 2053 static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
2051 { 2054 {
2052 int error = 0; 2055 int error = 0;
2053 int readonly_request = 0; 2056 int readonly_request = 0;
2054 2057
2055 if (ms_flags & MS_RDONLY) 2058 if (ms_flags & MS_RDONLY)
2056 readonly_request = 1; 2059 readonly_request = 1;
2057 if (readonly_request == __mnt_is_readonly(mnt)) 2060 if (readonly_request == __mnt_is_readonly(mnt))
2058 return 0; 2061 return 0;
2059 2062
2060 if (readonly_request) 2063 if (readonly_request)
2061 error = mnt_make_readonly(real_mount(mnt)); 2064 error = mnt_make_readonly(real_mount(mnt));
2062 else 2065 else
2063 __mnt_unmake_readonly(real_mount(mnt)); 2066 __mnt_unmake_readonly(real_mount(mnt));
2064 return error; 2067 return error;
2065 } 2068 }
2066 2069
2067 /* 2070 /*
2068 * change filesystem flags. dir should be a physical root of filesystem. 2071 * change filesystem flags. dir should be a physical root of filesystem.
2069 * If you've mounted a non-root directory somewhere and want to do remount 2072 * If you've mounted a non-root directory somewhere and want to do remount
2070 * on it - tough luck. 2073 * on it - tough luck.
2071 */ 2074 */
2072 static int do_remount(struct path *path, int flags, int mnt_flags, 2075 static int do_remount(struct path *path, int flags, int mnt_flags,
2073 void *data) 2076 void *data)
2074 { 2077 {
2075 int err; 2078 int err;
2076 struct super_block *sb = path->mnt->mnt_sb; 2079 struct super_block *sb = path->mnt->mnt_sb;
2077 struct mount *mnt = real_mount(path->mnt); 2080 struct mount *mnt = real_mount(path->mnt);
2078 2081
2079 if (!check_mnt(mnt)) 2082 if (!check_mnt(mnt))
2080 return -EINVAL; 2083 return -EINVAL;
2081 2084
2082 if (path->dentry != path->mnt->mnt_root) 2085 if (path->dentry != path->mnt->mnt_root)
2083 return -EINVAL; 2086 return -EINVAL;
2084 2087
2085 /* Don't allow changing of locked mnt flags. 2088 /* Don't allow changing of locked mnt flags.
2086 * 2089 *
2087 * No locks need to be held here while testing the various 2090 * No locks need to be held here while testing the various
2088 * MNT_LOCK flags because those flags can never be cleared 2091 * MNT_LOCK flags because those flags can never be cleared
2089 * once they are set. 2092 * once they are set.
2090 */ 2093 */
2091 if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) && 2094 if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
2092 !(mnt_flags & MNT_READONLY)) { 2095 !(mnt_flags & MNT_READONLY)) {
2093 return -EPERM; 2096 return -EPERM;
2094 } 2097 }
2095 if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) && 2098 if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
2096 !(mnt_flags & MNT_NODEV)) { 2099 !(mnt_flags & MNT_NODEV)) {
2097 return -EPERM; 2100 return -EPERM;
2098 } 2101 }
2099 if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) && 2102 if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
2100 !(mnt_flags & MNT_NOSUID)) { 2103 !(mnt_flags & MNT_NOSUID)) {
2101 return -EPERM; 2104 return -EPERM;
2102 } 2105 }
2103 if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) && 2106 if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) &&
2104 !(mnt_flags & MNT_NOEXEC)) { 2107 !(mnt_flags & MNT_NOEXEC)) {
2105 return -EPERM; 2108 return -EPERM;
2106 } 2109 }
2107 if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) && 2110 if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
2108 ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) { 2111 ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) {
2109 return -EPERM; 2112 return -EPERM;
2110 } 2113 }
2111 2114
2112 err = security_sb_remount(sb, data); 2115 err = security_sb_remount(sb, data);
2113 if (err) 2116 if (err)
2114 return err; 2117 return err;
2115 2118
2116 down_write(&sb->s_umount); 2119 down_write(&sb->s_umount);
2117 if (flags & MS_BIND) 2120 if (flags & MS_BIND)
2118 err = change_mount_flags(path->mnt, flags); 2121 err = change_mount_flags(path->mnt, flags);
2119 else if (!capable(CAP_SYS_ADMIN)) 2122 else if (!capable(CAP_SYS_ADMIN))
2120 err = -EPERM; 2123 err = -EPERM;
2121 else 2124 else
2122 err = do_remount_sb(sb, flags, data, 0); 2125 err = do_remount_sb(sb, flags, data, 0);
2123 if (!err) { 2126 if (!err) {
2124 lock_mount_hash(); 2127 lock_mount_hash();
2125 mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; 2128 mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
2126 mnt->mnt.mnt_flags = mnt_flags; 2129 mnt->mnt.mnt_flags = mnt_flags;
2127 touch_mnt_namespace(mnt->mnt_ns); 2130 touch_mnt_namespace(mnt->mnt_ns);
2128 unlock_mount_hash(); 2131 unlock_mount_hash();
2129 } 2132 }
2130 up_write(&sb->s_umount); 2133 up_write(&sb->s_umount);
2131 return err; 2134 return err;
2132 } 2135 }
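
The MNT_LOCK_* wall above is what keeps user namespaces honest: a flag the kernel forced on at mount time may be kept but never remounted away. The userspace view, hedged, assuming /mnt is a mount on which the kernel set MNT_LOCK_NODEV:

	/* omits MS_NODEV, so it would clear a locked flag: EPERM */
	mount(NULL, "/mnt", NULL, MS_REMOUNT | MS_BIND, NULL);

	/* carries MS_NODEV forward, so the locked flag survives: ok */
	mount(NULL, "/mnt", NULL, MS_REMOUNT | MS_BIND | MS_NODEV, NULL);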
2133 2136
2134 static inline int tree_contains_unbindable(struct mount *mnt) 2137 static inline int tree_contains_unbindable(struct mount *mnt)
2135 { 2138 {
2136 struct mount *p; 2139 struct mount *p;
2137 for (p = mnt; p; p = next_mnt(p, mnt)) { 2140 for (p = mnt; p; p = next_mnt(p, mnt)) {
2138 if (IS_MNT_UNBINDABLE(p)) 2141 if (IS_MNT_UNBINDABLE(p))
2139 return 1; 2142 return 1;
2140 } 2143 }
2141 return 0; 2144 return 0;
2142 } 2145 }
2143 2146
2144 static int do_move_mount(struct path *path, const char *old_name) 2147 static int do_move_mount(struct path *path, const char *old_name)
2145 { 2148 {
2146 struct path old_path, parent_path; 2149 struct path old_path, parent_path;
2147 struct mount *p; 2150 struct mount *p;
2148 struct mount *old; 2151 struct mount *old;
2149 struct mountpoint *mp; 2152 struct mountpoint *mp;
2150 int err; 2153 int err;
2151 if (!old_name || !*old_name) 2154 if (!old_name || !*old_name)
2152 return -EINVAL; 2155 return -EINVAL;
2153 err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); 2156 err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
2154 if (err) 2157 if (err)
2155 return err; 2158 return err;
2156 2159
2157 mp = lock_mount(path); 2160 mp = lock_mount(path);
2158 err = PTR_ERR(mp); 2161 err = PTR_ERR(mp);
2159 if (IS_ERR(mp)) 2162 if (IS_ERR(mp))
2160 goto out; 2163 goto out;
2161 2164
2162 old = real_mount(old_path.mnt); 2165 old = real_mount(old_path.mnt);
2163 p = real_mount(path->mnt); 2166 p = real_mount(path->mnt);
2164 2167
2165 err = -EINVAL; 2168 err = -EINVAL;
2166 if (!check_mnt(p) || !check_mnt(old)) 2169 if (!check_mnt(p) || !check_mnt(old))
2167 goto out1; 2170 goto out1;
2168 2171
2169 if (old->mnt.mnt_flags & MNT_LOCKED) 2172 if (old->mnt.mnt_flags & MNT_LOCKED)
2170 goto out1; 2173 goto out1;
2171 2174
2172 err = -EINVAL; 2175 err = -EINVAL;
2173 if (old_path.dentry != old_path.mnt->mnt_root) 2176 if (old_path.dentry != old_path.mnt->mnt_root)
2174 goto out1; 2177 goto out1;
2175 2178
2176 if (!mnt_has_parent(old)) 2179 if (!mnt_has_parent(old))
2177 goto out1; 2180 goto out1;
2178 2181
2179 if (S_ISDIR(path->dentry->d_inode->i_mode) != 2182 if (S_ISDIR(path->dentry->d_inode->i_mode) !=
2180 S_ISDIR(old_path.dentry->d_inode->i_mode)) 2183 S_ISDIR(old_path.dentry->d_inode->i_mode))
2181 goto out1; 2184 goto out1;
2182 /* 2185 /*
2183 * Don't move a mount residing in a shared parent. 2186 * Don't move a mount residing in a shared parent.
2184 */ 2187 */
2185 if (IS_MNT_SHARED(old->mnt_parent)) 2188 if (IS_MNT_SHARED(old->mnt_parent))
2186 goto out1; 2189 goto out1;
2187 /* 2190 /*
2188 * Don't move a mount tree containing unbindable mounts to a destination 2191 * Don't move a mount tree containing unbindable mounts to a destination
2189 * mount which is shared. 2192 * mount which is shared.
2190 */ 2193 */
2191 if (IS_MNT_SHARED(p) && tree_contains_unbindable(old)) 2194 if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
2192 goto out1; 2195 goto out1;
2193 err = -ELOOP; 2196 err = -ELOOP;
2194 for (; mnt_has_parent(p); p = p->mnt_parent) 2197 for (; mnt_has_parent(p); p = p->mnt_parent)
2195 if (p == old) 2198 if (p == old)
2196 goto out1; 2199 goto out1;
2197 2200
2198 err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path); 2201 err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path);
2199 if (err) 2202 if (err)
2200 goto out1; 2203 goto out1;
2201 2204
2202 /* if the mount is moved, it should no longer expire 2205 /* if the mount is moved, it should no longer expire
2203 * automatically */ 2206 * automatically */
2204 list_del_init(&old->mnt_expire); 2207 list_del_init(&old->mnt_expire);
2205 out1: 2208 out1:
2206 unlock_mount(mp); 2209 unlock_mount(mp);
2207 out: 2210 out:
2208 if (!err) 2211 if (!err)
2209 path_put(&parent_path); 2212 path_put(&parent_path);
2210 path_put(&old_path); 2213 path_put(&old_path);
2211 return err; 2214 return err;
2212 } 2215 }
2213 2216
2214 static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) 2217 static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
2215 { 2218 {
2216 int err; 2219 int err;
2217 const char *subtype = strchr(fstype, '.'); 2220 const char *subtype = strchr(fstype, '.');
2218 if (subtype) { 2221 if (subtype) {
2219 subtype++; 2222 subtype++;
2220 err = -EINVAL; 2223 err = -EINVAL;
2221 if (!subtype[0]) 2224 if (!subtype[0])
2222 goto err; 2225 goto err;
2223 } else 2226 } else
2224 subtype = ""; 2227 subtype = "";
2225 2228
2226 mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL); 2229 mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
2227 err = -ENOMEM; 2230 err = -ENOMEM;
2228 if (!mnt->mnt_sb->s_subtype) 2231 if (!mnt->mnt_sb->s_subtype)
2229 goto err; 2232 goto err;
2230 return mnt; 2233 return mnt;
2231 2234
2232 err: 2235 err:
2233 mntput(mnt); 2236 mntput(mnt);
2234 return ERR_PTR(err); 2237 return ERR_PTR(err);
2235 } 2238 }
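
For FS_HAS_SUBTYPE filesystems (FUSE, in practice) the piece after the first dot in the type string becomes sb->s_subtype:

	/*
	 * fstype "fuse.sshfs"  ->  s_subtype = "sshfs"; the mount shows
	 *                          up as type "fuse.sshfs" in /proc/mounts
	 * fstype "fuse."       ->  -EINVAL, the subtype is empty
	 */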
2236 2239
2237 /* 2240 /*
2238 * add a mount into a namespace's mount tree 2241 * add a mount into a namespace's mount tree
2239 */ 2242 */
2240 static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) 2243 static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
2241 { 2244 {
2242 struct mountpoint *mp; 2245 struct mountpoint *mp;
2243 struct mount *parent; 2246 struct mount *parent;
2244 int err; 2247 int err;
2245 2248
2246 mnt_flags &= ~MNT_INTERNAL_FLAGS; 2249 mnt_flags &= ~MNT_INTERNAL_FLAGS;
2247 2250
2248 mp = lock_mount(path); 2251 mp = lock_mount(path);
2249 if (IS_ERR(mp)) 2252 if (IS_ERR(mp))
2250 return PTR_ERR(mp); 2253 return PTR_ERR(mp);
2251 2254
2252 parent = real_mount(path->mnt); 2255 parent = real_mount(path->mnt);
2253 err = -EINVAL; 2256 err = -EINVAL;
2254 if (unlikely(!check_mnt(parent))) { 2257 if (unlikely(!check_mnt(parent))) {
2255 /* that's acceptable only for automounts done in private ns */ 2258 /* that's acceptable only for automounts done in private ns */
2256 if (!(mnt_flags & MNT_SHRINKABLE)) 2259 if (!(mnt_flags & MNT_SHRINKABLE))
2257 goto unlock; 2260 goto unlock;
2258 /* ... and for those we'd better have mountpoint still alive */ 2261 /* ... and for those we'd better have mountpoint still alive */
2259 if (!parent->mnt_ns) 2262 if (!parent->mnt_ns)
2260 goto unlock; 2263 goto unlock;
2261 } 2264 }
2262 2265
2263 /* Refuse the same filesystem on the same mount point */ 2266 /* Refuse the same filesystem on the same mount point */
2264 err = -EBUSY; 2267 err = -EBUSY;
2265 if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && 2268 if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
2266 path->mnt->mnt_root == path->dentry) 2269 path->mnt->mnt_root == path->dentry)
2267 goto unlock; 2270 goto unlock;
2268 2271
2269 err = -EINVAL; 2272 err = -EINVAL;
2270 if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode)) 2273 if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode))
2271 goto unlock; 2274 goto unlock;
2272 2275
2273 newmnt->mnt.mnt_flags = mnt_flags; 2276 newmnt->mnt.mnt_flags = mnt_flags;
2274 err = graft_tree(newmnt, parent, mp); 2277 err = graft_tree(newmnt, parent, mp);
2275 2278
2276 unlock: 2279 unlock:
2277 unlock_mount(mp); 2280 unlock_mount(mp);
2278 return err; 2281 return err;
2279 } 2282 }
2280 2283
2281 /* 2284 /*
2282 * create a new mount for userspace and request it to be added into the 2285 * create a new mount for userspace and request it to be added into the
2283 * namespace's tree 2286 * namespace's tree
2284 */ 2287 */
2285 static int do_new_mount(struct path *path, const char *fstype, int flags, 2288 static int do_new_mount(struct path *path, const char *fstype, int flags,
2286 int mnt_flags, const char *name, void *data) 2289 int mnt_flags, const char *name, void *data)
2287 { 2290 {
2288 struct file_system_type *type; 2291 struct file_system_type *type;
2289 struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; 2292 struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
2290 struct vfsmount *mnt; 2293 struct vfsmount *mnt;
2291 int err; 2294 int err;
2292 2295
2293 if (!fstype) 2296 if (!fstype)
2294 return -EINVAL; 2297 return -EINVAL;
2295 2298
2296 type = get_fs_type(fstype); 2299 type = get_fs_type(fstype);
2297 if (!type) 2300 if (!type)
2298 return -ENODEV; 2301 return -ENODEV;
2299 2302
2300 if (user_ns != &init_user_ns) { 2303 if (user_ns != &init_user_ns) {
2301 if (!(type->fs_flags & FS_USERNS_MOUNT)) { 2304 if (!(type->fs_flags & FS_USERNS_MOUNT)) {
2302 put_filesystem(type); 2305 put_filesystem(type);
2303 return -EPERM; 2306 return -EPERM;
2304 } 2307 }
2305 /* Only in special cases allow devices from mounts 2308 /* Only in special cases allow devices from mounts
2306 * created outside the initial user namespace. 2309 * created outside the initial user namespace.
2307 */ 2310 */
2308 if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { 2311 if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
2309 flags |= MS_NODEV; 2312 flags |= MS_NODEV;
2310 mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV; 2313 mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
2311 } 2314 }
2312 } 2315 }
2313 2316
2314 mnt = vfs_kern_mount(type, flags, name, data); 2317 mnt = vfs_kern_mount(type, flags, name, data);
2315 if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && 2318 if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
2316 !mnt->mnt_sb->s_subtype) 2319 !mnt->mnt_sb->s_subtype)
2317 mnt = fs_set_subtype(mnt, fstype); 2320 mnt = fs_set_subtype(mnt, fstype);
2318 2321
2319 put_filesystem(type); 2322 put_filesystem(type);
2320 if (IS_ERR(mnt)) 2323 if (IS_ERR(mnt))
2321 return PTR_ERR(mnt); 2324 return PTR_ERR(mnt);
2322 2325
2323 err = do_add_mount(real_mount(mnt), path, mnt_flags); 2326 err = do_add_mount(real_mount(mnt), path, mnt_flags);
2324 if (err) 2327 if (err)
2325 mntput(mnt); 2328 mntput(mnt);
2326 return err; 2329 return err;
2327 } 2330 }
2328 2331
2329 int finish_automount(struct vfsmount *m, struct path *path) 2332 int finish_automount(struct vfsmount *m, struct path *path)
2330 { 2333 {
2331 struct mount *mnt = real_mount(m); 2334 struct mount *mnt = real_mount(m);
2332 int err; 2335 int err;
2333 /* The new mount record should have at least 2 refs to prevent it from being 2336 /* The new mount record should have at least 2 refs to prevent it from being
2334 * expired before we get a chance to add it 2337 * expired before we get a chance to add it
2335 */ 2338 */
2336 BUG_ON(mnt_get_count(mnt) < 2); 2339 BUG_ON(mnt_get_count(mnt) < 2);
2337 2340
2338 if (m->mnt_sb == path->mnt->mnt_sb && 2341 if (m->mnt_sb == path->mnt->mnt_sb &&
2339 m->mnt_root == path->dentry) { 2342 m->mnt_root == path->dentry) {
2340 err = -ELOOP; 2343 err = -ELOOP;
2341 goto fail; 2344 goto fail;
2342 } 2345 }
2343 2346
2344 err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE); 2347 err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
2345 if (!err) 2348 if (!err)
2346 return 0; 2349 return 0;
2347 fail: 2350 fail:
2348 /* remove m from any expiration list it may be on */ 2351 /* remove m from any expiration list it may be on */
2349 if (!list_empty(&mnt->mnt_expire)) { 2352 if (!list_empty(&mnt->mnt_expire)) {
2350 namespace_lock(); 2353 namespace_lock();
2351 list_del_init(&mnt->mnt_expire); 2354 list_del_init(&mnt->mnt_expire);
2352 namespace_unlock(); 2355 namespace_unlock();
2353 } 2356 }
2354 mntput(m); 2357 mntput(m);
2355 mntput(m); 2358 mntput(m);
2356 return err; 2359 return err;
2357 } 2360 }
2358 2361
2359 /** 2362 /**
2360 * mnt_set_expiry - Put a mount on an expiration list 2363 * mnt_set_expiry - Put a mount on an expiration list
2361 * @mnt: The mount to list. 2364 * @mnt: The mount to list.
2362 * @expiry_list: The list to add the mount to. 2365 * @expiry_list: The list to add the mount to.
2363 */ 2366 */
2364 void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) 2367 void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
2365 { 2368 {
2366 namespace_lock(); 2369 namespace_lock();
2367 2370
2368 list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); 2371 list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
2369 2372
2370 namespace_unlock(); 2373 namespace_unlock();
2371 } 2374 }
2372 EXPORT_SYMBOL(mnt_set_expiry); 2375 EXPORT_SYMBOL(mnt_set_expiry);
2373 2376
2374 /* 2377 /*
2375 * process a list of expirable mountpoints with the intent of discarding any 2378 * process a list of expirable mountpoints with the intent of discarding any
2376 * mountpoints that aren't in use and haven't been touched since we last 2379 * mountpoints that aren't in use and haven't been touched since we last
2377 * came here 2380 * came here
2378 */ 2381 */
2379 void mark_mounts_for_expiry(struct list_head *mounts) 2382 void mark_mounts_for_expiry(struct list_head *mounts)
2380 { 2383 {
2381 struct mount *mnt, *next; 2384 struct mount *mnt, *next;
2382 LIST_HEAD(graveyard); 2385 LIST_HEAD(graveyard);
2383 2386
2384 if (list_empty(mounts)) 2387 if (list_empty(mounts))
2385 return; 2388 return;
2386 2389
2387 namespace_lock(); 2390 namespace_lock();
2388 lock_mount_hash(); 2391 lock_mount_hash();
2389 2392
2390 /* extract from the expiration list every vfsmount that matches the 2393 /* extract from the expiration list every vfsmount that matches the
2391 * following criteria: 2394 * following criteria:
2392 * - only referenced by its parent vfsmount 2395 * - only referenced by its parent vfsmount
2393 * - still marked for expiry (marked on the last call here; marks are 2396 * - still marked for expiry (marked on the last call here; marks are
2394 * cleared by mntput()) 2397 * cleared by mntput())
2395 */ 2398 */
2396 list_for_each_entry_safe(mnt, next, mounts, mnt_expire) { 2399 list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
2397 if (!xchg(&mnt->mnt_expiry_mark, 1) || 2400 if (!xchg(&mnt->mnt_expiry_mark, 1) ||
2398 propagate_mount_busy(mnt, 1)) 2401 propagate_mount_busy(mnt, 1))
2399 continue; 2402 continue;
2400 list_move(&mnt->mnt_expire, &graveyard); 2403 list_move(&mnt->mnt_expire, &graveyard);
2401 } 2404 }
2402 while (!list_empty(&graveyard)) { 2405 while (!list_empty(&graveyard)) {
2403 mnt = list_first_entry(&graveyard, struct mount, mnt_expire); 2406 mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
2404 touch_mnt_namespace(mnt->mnt_ns); 2407 touch_mnt_namespace(mnt->mnt_ns);
2405 umount_tree(mnt, 1); 2408 umount_tree(mnt, 1);
2406 } 2409 }
2407 unlock_mount_hash(); 2410 unlock_mount_hash();
2408 namespace_unlock(); 2411 namespace_unlock();
2409 } 2412 }
2410 2413
2411 EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); 2414 EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
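
mnt_set_expiry() and mark_mounts_for_expiry() form a two-pass reaper: one pass only sets mnt_expiry_mark (mntput() clears it again on any use), and a mount still marked on the next pass gets unmounted. A hedged consumer sketch, loosely in the style of the AFS/NFS automount code; every example_* name is made up:

	static LIST_HEAD(example_expiry_list);

	/* called from ->d_automount() after finish_automount() succeeds */
	static void example_track(struct vfsmount *newmnt)
	{
		mnt_set_expiry(newmnt, &example_expiry_list);
	}

	/* from a periodic worker: pass N marks, pass N+1 reaps whatever
	 * stayed marked, i.e. went unused for a whole period */
	static void example_reap(struct work_struct *work)
	{
		mark_mounts_for_expiry(&example_expiry_list);
	}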
2412 2415
2413 /* 2416 /*
2414 * Ripoff of 'select_parent()' 2417 * Ripoff of 'select_parent()'
2415 * 2418 *
2416 * search the list of submounts for a given mountpoint, and move any 2419 * search the list of submounts for a given mountpoint, and move any
2417 * shrinkable submounts to the 'graveyard' list. 2420 * shrinkable submounts to the 'graveyard' list.
2418 */ 2421 */
2419 static int select_submounts(struct mount *parent, struct list_head *graveyard) 2422 static int select_submounts(struct mount *parent, struct list_head *graveyard)
2420 { 2423 {
2421 struct mount *this_parent = parent; 2424 struct mount *this_parent = parent;
2422 struct list_head *next; 2425 struct list_head *next;
2423 int found = 0; 2426 int found = 0;
2424 2427
2425 repeat: 2428 repeat:
2426 next = this_parent->mnt_mounts.next; 2429 next = this_parent->mnt_mounts.next;
2427 resume: 2430 resume:
2428 while (next != &this_parent->mnt_mounts) { 2431 while (next != &this_parent->mnt_mounts) {
2429 struct list_head *tmp = next; 2432 struct list_head *tmp = next;
2430 struct mount *mnt = list_entry(tmp, struct mount, mnt_child); 2433 struct mount *mnt = list_entry(tmp, struct mount, mnt_child);
2431 2434
2432 next = tmp->next; 2435 next = tmp->next;
2433 if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE)) 2436 if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
2434 continue; 2437 continue;
2435 /* 2438 /*
2436 * Descend a level if the mnt_mounts list is non-empty. 2439 * Descend a level if the mnt_mounts list is non-empty.
2437 */ 2440 */
2438 if (!list_empty(&mnt->mnt_mounts)) { 2441 if (!list_empty(&mnt->mnt_mounts)) {
2439 this_parent = mnt; 2442 this_parent = mnt;
2440 goto repeat; 2443 goto repeat;
2441 } 2444 }
2442 2445
2443 if (!propagate_mount_busy(mnt, 1)) { 2446 if (!propagate_mount_busy(mnt, 1)) {
2444 list_move_tail(&mnt->mnt_expire, graveyard); 2447 list_move_tail(&mnt->mnt_expire, graveyard);
2445 found++; 2448 found++;
2446 } 2449 }
2447 } 2450 }
2448 /* 2451 /*
2449 * All done at this level ... ascend and resume the search 2452 * All done at this level ... ascend and resume the search
2450 */ 2453 */
2451 if (this_parent != parent) { 2454 if (this_parent != parent) {
2452 next = this_parent->mnt_child.next; 2455 next = this_parent->mnt_child.next;
2453 this_parent = this_parent->mnt_parent; 2456 this_parent = this_parent->mnt_parent;
2454 goto resume; 2457 goto resume;
2455 } 2458 }
2456 return found; 2459 return found;
2457 } 2460 }
2458 2461
2459 /* 2462 /*
2460 * process a list of expirable mountpoints with the intent of discarding any 2463 * process a list of expirable mountpoints with the intent of discarding any
2461 * submounts of a specific parent mountpoint 2464 * submounts of a specific parent mountpoint
2462 * 2465 *
2463 * mount_lock must be held for write 2466 * mount_lock must be held for write
2464 */ 2467 */
2465 static void shrink_submounts(struct mount *mnt) 2468 static void shrink_submounts(struct mount *mnt)
2466 { 2469 {
2467 LIST_HEAD(graveyard); 2470 LIST_HEAD(graveyard);
2468 struct mount *m; 2471 struct mount *m;
2469 2472
2470 /* extract submounts of 'mountpoint' from the expiration list */ 2473 /* extract submounts of 'mountpoint' from the expiration list */
2471 while (select_submounts(mnt, &graveyard)) { 2474 while (select_submounts(mnt, &graveyard)) {
2472 while (!list_empty(&graveyard)) { 2475 while (!list_empty(&graveyard)) {
2473 m = list_first_entry(&graveyard, struct mount, 2476 m = list_first_entry(&graveyard, struct mount,
2474 mnt_expire); 2477 mnt_expire);
2475 touch_mnt_namespace(m->mnt_ns); 2478 touch_mnt_namespace(m->mnt_ns);
2476 umount_tree(m, 1); 2479 umount_tree(m, 1);
2477 } 2480 }
2478 } 2481 }
2479 } 2482 }
2480 2483
2481 /* 2484 /*
2482 * Some copy_from_user() implementations do not return the exact number of 2485 * Some copy_from_user() implementations do not return the exact number of
2483 * bytes remaining to copy on a fault. But copy_mount_options() requires that. 2486 * bytes remaining to copy on a fault. But copy_mount_options() requires that.
2484 * Note that this function differs from copy_from_user() in that it will oops 2487 * Note that this function differs from copy_from_user() in that it will oops
2485 * on bad values of `to', rather than returning a short copy. 2488 * on bad values of `to', rather than returning a short copy.
2486 */ 2489 */
2487 static long exact_copy_from_user(void *to, const void __user * from, 2490 static long exact_copy_from_user(void *to, const void __user * from,
2488 unsigned long n) 2491 unsigned long n)
2489 { 2492 {
2490 char *t = to; 2493 char *t = to;
2491 const char __user *f = from; 2494 const char __user *f = from;
2492 char c; 2495 char c;
2493 2496
2494 if (!access_ok(VERIFY_READ, from, n)) 2497 if (!access_ok(VERIFY_READ, from, n))
2495 return n; 2498 return n;
2496 2499
2497 while (n) { 2500 while (n) {
2498 if (__get_user(c, f)) { 2501 if (__get_user(c, f)) {
2499 memset(t, 0, n); 2502 memset(t, 0, n);
2500 break; 2503 break;
2501 } 2504 }
2502 *t++ = c; 2505 *t++ = c;
2503 f++; 2506 f++;
2504 n--; 2507 n--;
2505 } 2508 }
2506 return n; 2509 return n;
2507 } 2510 }
2508 2511
2509 int copy_mount_options(const void __user * data, unsigned long *where) 2512 int copy_mount_options(const void __user * data, unsigned long *where)
2510 { 2513 {
2511 int i; 2514 int i;
2512 unsigned long page; 2515 unsigned long page;
2513 unsigned long size; 2516 unsigned long size;
2514 2517
2515 *where = 0; 2518 *where = 0;
2516 if (!data) 2519 if (!data)
2517 return 0; 2520 return 0;
2518 2521
2519 if (!(page = __get_free_page(GFP_KERNEL))) 2522 if (!(page = __get_free_page(GFP_KERNEL)))
2520 return -ENOMEM; 2523 return -ENOMEM;
2521 2524
2522 /* We only care that *some* data at the address the user 2525 /* We only care that *some* data at the address the user
2523 * gave us is valid. Just in case, we'll zero 2526 * gave us is valid. Just in case, we'll zero
2524 * the remainder of the page. 2527 * the remainder of the page.
2525 */ 2528 */
2526 /* copy_from_user cannot cross TASK_SIZE ! */ 2529 /* copy_from_user cannot cross TASK_SIZE ! */
2527 size = TASK_SIZE - (unsigned long)data; 2530 size = TASK_SIZE - (unsigned long)data;
2528 if (size > PAGE_SIZE) 2531 if (size > PAGE_SIZE)
2529 size = PAGE_SIZE; 2532 size = PAGE_SIZE;
2530 2533
2531 i = size - exact_copy_from_user((void *)page, data, size); 2534 i = size - exact_copy_from_user((void *)page, data, size);
2532 if (!i) { 2535 if (!i) {
2533 free_page(page); 2536 free_page(page);
2534 return -EFAULT; 2537 return -EFAULT;
2535 } 2538 }
2536 if (i != PAGE_SIZE) 2539 if (i != PAGE_SIZE)
2537 memset((char *)page + i, 0, PAGE_SIZE - i); 2540 memset((char *)page + i, 0, PAGE_SIZE - i);
2538 *where = page; 2541 *where = page;
2539 return 0; 2542 return 0;
2540 } 2543 }
2541 2544
2542 char *copy_mount_string(const void __user *data) 2545 char *copy_mount_string(const void __user *data)
2543 { 2546 {
2544 return data ? strndup_user(data, PAGE_SIZE) : NULL; 2547 return data ? strndup_user(data, PAGE_SIZE) : NULL;
2545 } 2548 }
2546 2549
2547 /* 2550 /*
2548 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to 2551 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
2549 * be given to the mount() call (ie: read-only, no-dev, no-suid etc). 2552 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
2550 * 2553 *
2551 * data is a (void *) that can point to any structure up to 2554 * data is a (void *) that can point to any structure up to
2552 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent 2555 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
2553 * information (or be NULL). 2556 * information (or be NULL).
2554 * 2557 *
2555 * Pre-0.97 versions of mount() didn't have a flags word. 2558 * Pre-0.97 versions of mount() didn't have a flags word.
2556 * When the flags word was introduced its top half was required 2559 * When the flags word was introduced its top half was required
2557 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9. 2560 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
2558 * Therefore, if this magic number is present, it carries no information 2561 * Therefore, if this magic number is present, it carries no information
2559 * and must be discarded. 2562 * and must be discarded.
2560 */ 2563 */
2561 long do_mount(const char *dev_name, const char __user *dir_name, 2564 long do_mount(const char *dev_name, const char __user *dir_name,
2562 const char *type_page, unsigned long flags, void *data_page) 2565 const char *type_page, unsigned long flags, void *data_page)
2563 { 2566 {
2564 struct path path; 2567 struct path path;
2565 int retval = 0; 2568 int retval = 0;
2566 int mnt_flags = 0; 2569 int mnt_flags = 0;
2567 2570
2568 /* Discard magic */ 2571 /* Discard magic */
2569 if ((flags & MS_MGC_MSK) == MS_MGC_VAL) 2572 if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
2570 flags &= ~MS_MGC_MSK; 2573 flags &= ~MS_MGC_MSK;
2571 2574
2572 /* Basic sanity checks */ 2575 /* Basic sanity checks */
2573 if (data_page) 2576 if (data_page)
2574 ((char *)data_page)[PAGE_SIZE - 1] = 0; 2577 ((char *)data_page)[PAGE_SIZE - 1] = 0;
2575 2578
2576 /* ... and get the mountpoint */ 2579 /* ... and get the mountpoint */
2577 retval = user_path(dir_name, &path); 2580 retval = user_path(dir_name, &path);
2578 if (retval) 2581 if (retval)
2579 return retval; 2582 return retval;
2580 2583
2581 retval = security_sb_mount(dev_name, &path, 2584 retval = security_sb_mount(dev_name, &path,
2582 type_page, flags, data_page); 2585 type_page, flags, data_page);
2583 if (!retval && !may_mount()) 2586 if (!retval && !may_mount())
2584 retval = -EPERM; 2587 retval = -EPERM;
2585 if (retval) 2588 if (retval)
2586 goto dput_out; 2589 goto dput_out;
2587 2590
2588 /* Default to relatime unless overridden */ 2591 /* Default to relatime unless overridden */
2589 if (!(flags & MS_NOATIME)) 2592 if (!(flags & MS_NOATIME))
2590 mnt_flags |= MNT_RELATIME; 2593 mnt_flags |= MNT_RELATIME;
2591 2594
2592 /* Separate the per-mountpoint flags */ 2595 /* Separate the per-mountpoint flags */
2593 if (flags & MS_NOSUID) 2596 if (flags & MS_NOSUID)
2594 mnt_flags |= MNT_NOSUID; 2597 mnt_flags |= MNT_NOSUID;
2595 if (flags & MS_NODEV) 2598 if (flags & MS_NODEV)
2596 mnt_flags |= MNT_NODEV; 2599 mnt_flags |= MNT_NODEV;
2597 if (flags & MS_NOEXEC) 2600 if (flags & MS_NOEXEC)
2598 mnt_flags |= MNT_NOEXEC; 2601 mnt_flags |= MNT_NOEXEC;
2599 if (flags & MS_NOATIME) 2602 if (flags & MS_NOATIME)
2600 mnt_flags |= MNT_NOATIME; 2603 mnt_flags |= MNT_NOATIME;
2601 if (flags & MS_NODIRATIME) 2604 if (flags & MS_NODIRATIME)
2602 mnt_flags |= MNT_NODIRATIME; 2605 mnt_flags |= MNT_NODIRATIME;
2603 if (flags & MS_STRICTATIME) 2606 if (flags & MS_STRICTATIME)
2604 mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); 2607 mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
2605 if (flags & MS_RDONLY) 2608 if (flags & MS_RDONLY)
2606 mnt_flags |= MNT_READONLY; 2609 mnt_flags |= MNT_READONLY;
2607 2610
2608 /* The default atime for remount is preservation */ 2611 /* The default atime for remount is preservation */
2609 if ((flags & MS_REMOUNT) && 2612 if ((flags & MS_REMOUNT) &&
2610 ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME | 2613 ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
2611 MS_STRICTATIME)) == 0)) { 2614 MS_STRICTATIME)) == 0)) {
2612 mnt_flags &= ~MNT_ATIME_MASK; 2615 mnt_flags &= ~MNT_ATIME_MASK;
2613 mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK; 2616 mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK;
2614 } 2617 }
2615 2618
2616 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | 2619 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
2617 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | 2620 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
2618 MS_STRICTATIME); 2621 MS_STRICTATIME);
2619 2622
2620 if (flags & MS_REMOUNT) 2623 if (flags & MS_REMOUNT)
2621 retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, 2624 retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
2622 data_page); 2625 data_page);
2623 else if (flags & MS_BIND) 2626 else if (flags & MS_BIND)
2624 retval = do_loopback(&path, dev_name, flags & MS_REC); 2627 retval = do_loopback(&path, dev_name, flags & MS_REC);
2625 else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) 2628 else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
2626 retval = do_change_type(&path, flags); 2629 retval = do_change_type(&path, flags);
2627 else if (flags & MS_MOVE) 2630 else if (flags & MS_MOVE)
2628 retval = do_move_mount(&path, dev_name); 2631 retval = do_move_mount(&path, dev_name);
2629 else 2632 else
2630 retval = do_new_mount(&path, type_page, flags, mnt_flags, 2633 retval = do_new_mount(&path, type_page, flags, mnt_flags,
2631 dev_name, data_page); 2634 dev_name, data_page);
2632 dput_out: 2635 dput_out:
2633 path_put(&path); 2636 path_put(&path);
2634 return retval; 2637 return retval;
2635 } 2638 }
2636 2639
2637 static void free_mnt_ns(struct mnt_namespace *ns) 2640 static void free_mnt_ns(struct mnt_namespace *ns)
2638 { 2641 {
2639 ns_free_inum(&ns->ns); 2642 ns_free_inum(&ns->ns);
2640 put_user_ns(ns->user_ns); 2643 put_user_ns(ns->user_ns);
2641 kfree(ns); 2644 kfree(ns);
2642 } 2645 }
2643 2646
2644 /* 2647 /*
2645 * Assign a sequence number so we can detect when we attempt to bind 2648 * Assign a sequence number so we can detect when we attempt to bind
2646 * mount a reference to an older mount namespace into the current 2649 * mount a reference to an older mount namespace into the current
2647 * mount namespace, preventing reference counting loops. A 64bit 2650 * mount namespace, preventing reference counting loops. A 64bit
2648 * number incrementing even at 10GHz would take about 58 years to wrap, 2651 * number incrementing even at 10GHz would take about 58 years to wrap,
2649 * which at realistic creation rates is effectively never, so we can ignore it. 2652 * which at realistic creation rates is effectively never, so we can ignore it.
2650 */ 2653 */
2651 static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); 2654 static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
2652 2655
2653 static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) 2656 static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
2654 { 2657 {
2655 struct mnt_namespace *new_ns; 2658 struct mnt_namespace *new_ns;
2656 int ret; 2659 int ret;
2657 2660
2658 new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); 2661 new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
2659 if (!new_ns) 2662 if (!new_ns)
2660 return ERR_PTR(-ENOMEM); 2663 return ERR_PTR(-ENOMEM);
2661 ret = ns_alloc_inum(&new_ns->ns); 2664 ret = ns_alloc_inum(&new_ns->ns);
2662 if (ret) { 2665 if (ret) {
2663 kfree(new_ns); 2666 kfree(new_ns);
2664 return ERR_PTR(ret); 2667 return ERR_PTR(ret);
2665 } 2668 }
2666 new_ns->ns.ops = &mntns_operations; 2669 new_ns->ns.ops = &mntns_operations;
2667 new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); 2670 new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
2668 atomic_set(&new_ns->count, 1); 2671 atomic_set(&new_ns->count, 1);
2669 new_ns->root = NULL; 2672 new_ns->root = NULL;
2670 INIT_LIST_HEAD(&new_ns->list); 2673 INIT_LIST_HEAD(&new_ns->list);
2671 init_waitqueue_head(&new_ns->poll); 2674 init_waitqueue_head(&new_ns->poll);
2672 new_ns->event = 0; 2675 new_ns->event = 0;
2673 new_ns->user_ns = get_user_ns(user_ns); 2676 new_ns->user_ns = get_user_ns(user_ns);
2674 return new_ns; 2677 return new_ns;
2675 } 2678 }
2676 2679
2677 struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, 2680 struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
2678 struct user_namespace *user_ns, struct fs_struct *new_fs) 2681 struct user_namespace *user_ns, struct fs_struct *new_fs)
2679 { 2682 {
2680 struct mnt_namespace *new_ns; 2683 struct mnt_namespace *new_ns;
2681 struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; 2684 struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
2682 struct mount *p, *q; 2685 struct mount *p, *q;
2683 struct mount *old; 2686 struct mount *old;
2684 struct mount *new; 2687 struct mount *new;
2685 int copy_flags; 2688 int copy_flags;
2686 2689
2687 BUG_ON(!ns); 2690 BUG_ON(!ns);
2688 2691
2689 if (likely(!(flags & CLONE_NEWNS))) { 2692 if (likely(!(flags & CLONE_NEWNS))) {
2690 get_mnt_ns(ns); 2693 get_mnt_ns(ns);
2691 return ns; 2694 return ns;
2692 } 2695 }
2693 2696
2694 old = ns->root; 2697 old = ns->root;
2695 2698
2696 new_ns = alloc_mnt_ns(user_ns); 2699 new_ns = alloc_mnt_ns(user_ns);
2697 if (IS_ERR(new_ns)) 2700 if (IS_ERR(new_ns))
2698 return new_ns; 2701 return new_ns;
2699 2702
2700 namespace_lock(); 2703 namespace_lock();
2701 /* First pass: copy the tree topology */ 2704 /* First pass: copy the tree topology */
2702 copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; 2705 copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
2703 if (user_ns != ns->user_ns) 2706 if (user_ns != ns->user_ns)
2704 copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED; 2707 copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
2705 new = copy_tree(old, old->mnt.mnt_root, copy_flags); 2708 new = copy_tree(old, old->mnt.mnt_root, copy_flags);
2706 if (IS_ERR(new)) { 2709 if (IS_ERR(new)) {
2707 namespace_unlock(); 2710 namespace_unlock();
2708 free_mnt_ns(new_ns); 2711 free_mnt_ns(new_ns);
2709 return ERR_CAST(new); 2712 return ERR_CAST(new);
2710 } 2713 }
2711 new_ns->root = new; 2714 new_ns->root = new;
2712 list_add_tail(&new_ns->list, &new->mnt_list); 2715 list_add_tail(&new_ns->list, &new->mnt_list);
2713 2716
2714 /* 2717 /*
2715 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts 2718 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
2716 * as belonging to new namespace. We have already acquired a private 2719 * as belonging to new namespace. We have already acquired a private
2717 * fs_struct, so tsk->fs->lock is not needed. 2720 * fs_struct, so tsk->fs->lock is not needed.
2718 */ 2721 */
2719 p = old; 2722 p = old;
2720 q = new; 2723 q = new;
2721 while (p) { 2724 while (p) {
2722 q->mnt_ns = new_ns; 2725 q->mnt_ns = new_ns;
2723 if (new_fs) { 2726 if (new_fs) {
2724 if (&p->mnt == new_fs->root.mnt) { 2727 if (&p->mnt == new_fs->root.mnt) {
2725 new_fs->root.mnt = mntget(&q->mnt); 2728 new_fs->root.mnt = mntget(&q->mnt);
2726 rootmnt = &p->mnt; 2729 rootmnt = &p->mnt;
2727 } 2730 }
2728 if (&p->mnt == new_fs->pwd.mnt) { 2731 if (&p->mnt == new_fs->pwd.mnt) {
2729 new_fs->pwd.mnt = mntget(&q->mnt); 2732 new_fs->pwd.mnt = mntget(&q->mnt);
2730 pwdmnt = &p->mnt; 2733 pwdmnt = &p->mnt;
2731 } 2734 }
2732 } 2735 }
2733 p = next_mnt(p, old); 2736 p = next_mnt(p, old);
2734 q = next_mnt(q, new); 2737 q = next_mnt(q, new);
2735 if (!q) 2738 if (!q)
2736 break; 2739 break;
2737 while (p->mnt.mnt_root != q->mnt.mnt_root) 2740 while (p->mnt.mnt_root != q->mnt.mnt_root)
2738 p = next_mnt(p, old); 2741 p = next_mnt(p, old);
2739 } 2742 }
2740 namespace_unlock(); 2743 namespace_unlock();
2741 2744
2742 if (rootmnt) 2745 if (rootmnt)
2743 mntput(rootmnt); 2746 mntput(rootmnt);
2744 if (pwdmnt) 2747 if (pwdmnt)
2745 mntput(pwdmnt); 2748 mntput(pwdmnt);
2746 2749
2747 return new_ns; 2750 return new_ns;
2748 } 2751 }
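copy_mnt_ns() is the kernel side of clone(2)/unshare(2) with CLONE_NEWNS: the first pass above duplicates the mount tree, the second rewires the caller's root and cwd onto the copies. A minimal, hedged userspace trigger (needs CAP_SYS_ADMIN; error handling abbreviated):

        #define _GNU_SOURCE
        #include <sched.h>
        #include <stdio.h>
        #include <sys/mount.h>

        int main(void)
        {
                if (unshare(CLONE_NEWNS) == -1) {       /* ends up in copy_mnt_ns() */
                        perror("unshare");
                        return 1;
                }
                /* keep later mounts from propagating back to the old namespace */
                if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) == -1)
                        perror("mount");
                return 0;
        }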
2749 2752
2750 /** 2753 /**
2751 * create_mnt_ns - creates a private namespace and adds a root filesystem 2754 * create_mnt_ns - creates a private namespace and adds a root filesystem
2752 * @m: pointer to the new root filesystem mountpoint 2755 * @m: pointer to the new root filesystem mountpoint
2753 */ 2756 */
2754 static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) 2757 static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
2755 { 2758 {
2756 struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns); 2759 struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns);
2757 if (!IS_ERR(new_ns)) { 2760 if (!IS_ERR(new_ns)) {
2758 struct mount *mnt = real_mount(m); 2761 struct mount *mnt = real_mount(m);
2759 mnt->mnt_ns = new_ns; 2762 mnt->mnt_ns = new_ns;
2760 new_ns->root = mnt; 2763 new_ns->root = mnt;
2761 list_add(&mnt->mnt_list, &new_ns->list); 2764 list_add(&mnt->mnt_list, &new_ns->list);
2762 } else { 2765 } else {
2763 mntput(m); 2766 mntput(m);
2764 } 2767 }
2765 return new_ns; 2768 return new_ns;
2766 } 2769 }
2767 2770
2768 struct dentry *mount_subtree(struct vfsmount *mnt, const char *name) 2771 struct dentry *mount_subtree(struct vfsmount *mnt, const char *name)
2769 { 2772 {
2770 struct mnt_namespace *ns; 2773 struct mnt_namespace *ns;
2771 struct super_block *s; 2774 struct super_block *s;
2772 struct path path; 2775 struct path path;
2773 int err; 2776 int err;
2774 2777
2775 ns = create_mnt_ns(mnt); 2778 ns = create_mnt_ns(mnt);
2776 if (IS_ERR(ns)) 2779 if (IS_ERR(ns))
2777 return ERR_CAST(ns); 2780 return ERR_CAST(ns);
2778 2781
2779 err = vfs_path_lookup(mnt->mnt_root, mnt, 2782 err = vfs_path_lookup(mnt->mnt_root, mnt,
2780 name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); 2783 name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
2781 2784
2782 put_mnt_ns(ns); 2785 put_mnt_ns(ns);
2783 2786
2784 if (err) 2787 if (err)
2785 return ERR_PTR(err); 2788 return ERR_PTR(err);
2786 2789
2787 /* trade a vfsmount reference for active sb one */ 2790 /* trade a vfsmount reference for active sb one */
2788 s = path.mnt->mnt_sb; 2791 s = path.mnt->mnt_sb;
2789 atomic_inc(&s->s_active); 2792 atomic_inc(&s->s_active);
2790 mntput(path.mnt); 2793 mntput(path.mnt);
2791 /* lock the sucker */ 2794 /* lock the sucker */
2792 down_write(&s->s_umount); 2795 down_write(&s->s_umount);
2793 /* ... and return the root of (sub)tree on it */ 2796 /* ... and return the root of (sub)tree on it */
2794 return path.dentry; 2797 return path.dentry;
2795 } 2798 }
2796 EXPORT_SYMBOL(mount_subtree); 2799 EXPORT_SYMBOL(mount_subtree);
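mount_subtree() lets a filesystem return the dentry of an interior path as the root of what ends up mounted; the temporary namespace built around the lookup is dropped before returning, trading the vfsmount reference for an active-superblock one. A hedged kernel-context fragment of a caller (inner_mnt and the path are hypothetical):

        struct dentry *root = mount_subtree(inner_mnt, "/export/sub");
        if (IS_ERR(root))
                return root;    /* the temporary namespace is already gone */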
2797 2800
2798 SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, 2801 SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
2799 char __user *, type, unsigned long, flags, void __user *, data) 2802 char __user *, type, unsigned long, flags, void __user *, data)
2800 { 2803 {
2801 int ret; 2804 int ret;
2802 char *kernel_type; 2805 char *kernel_type;
2803 char *kernel_dev; 2806 char *kernel_dev;
2804 unsigned long data_page; 2807 unsigned long data_page;
2805 2808
2806 kernel_type = copy_mount_string(type); 2809 kernel_type = copy_mount_string(type);
2807 ret = PTR_ERR(kernel_type); 2810 ret = PTR_ERR(kernel_type);
2808 if (IS_ERR(kernel_type)) 2811 if (IS_ERR(kernel_type))
2809 goto out_type; 2812 goto out_type;
2810 2813
2811 kernel_dev = copy_mount_string(dev_name); 2814 kernel_dev = copy_mount_string(dev_name);
2812 ret = PTR_ERR(kernel_dev); 2815 ret = PTR_ERR(kernel_dev);
2813 if (IS_ERR(kernel_dev)) 2816 if (IS_ERR(kernel_dev))
2814 goto out_dev; 2817 goto out_dev;
2815 2818
2816 ret = copy_mount_options(data, &data_page); 2819 ret = copy_mount_options(data, &data_page);
2817 if (ret < 0) 2820 if (ret < 0)
2818 goto out_data; 2821 goto out_data;
2819 2822
2820 ret = do_mount(kernel_dev, dir_name, kernel_type, flags, 2823 ret = do_mount(kernel_dev, dir_name, kernel_type, flags,
2821 (void *) data_page); 2824 (void *) data_page);
2822 2825
2823 free_page(data_page); 2826 free_page(data_page);
2824 out_data: 2827 out_data:
2825 kfree(kernel_dev); 2828 kfree(kernel_dev);
2826 out_dev: 2829 out_dev:
2827 kfree(kernel_type); 2830 kfree(kernel_type);
2828 out_type: 2831 out_type:
2829 return ret; 2832 return ret;
2830 } 2833 }
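The five parameters unpacked above are exactly the userspace mount(2) arguments; copy_mount_options() copies the data blob into a single page. For illustration (mount point hypothetical, needs privilege):

        #include <stdio.h>
        #include <sys/mount.h>

        int main(void)
        {
                /* dev_name, dir_name, type, flags, data, in that order */
                if (mount("tmpfs", "/mnt/scratch", "tmpfs", 0, "size=16m") == -1)
                        perror("mount");
                return 0;
        }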
2831 2834
2832 /* 2835 /*
2833 * Return true if path is reachable from root 2836 * Return true if path is reachable from root
2834 * 2837 *
2835 * namespace_sem or mount_lock is held 2838 * namespace_sem or mount_lock is held
2836 */ 2839 */
2837 bool is_path_reachable(struct mount *mnt, struct dentry *dentry, 2840 bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
2838 const struct path *root) 2841 const struct path *root)
2839 { 2842 {
2840 while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) { 2843 while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
2841 dentry = mnt->mnt_mountpoint; 2844 dentry = mnt->mnt_mountpoint;
2842 mnt = mnt->mnt_parent; 2845 mnt = mnt->mnt_parent;
2843 } 2846 }
2844 return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry); 2847 return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
2845 } 2848 }
2846 2849
2847 int path_is_under(struct path *path1, struct path *path2) 2850 int path_is_under(struct path *path1, struct path *path2)
2848 { 2851 {
2849 int res; 2852 int res;
2850 read_seqlock_excl(&mount_lock); 2853 read_seqlock_excl(&mount_lock);
2851 res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); 2854 res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
2852 read_sequnlock_excl(&mount_lock); 2855 read_sequnlock_excl(&mount_lock);
2853 return res; 2856 return res;
2854 } 2857 }
2855 EXPORT_SYMBOL(path_is_under); 2858 EXPORT_SYMBOL(path_is_under);
2856 2859
2857 /* 2860 /*
2858 * pivot_root Semantics: 2861 * pivot_root Semantics:
2859 * Moves the root file system of the current process to the directory put_old, 2862 * Moves the root file system of the current process to the directory put_old,
2860 * makes new_root the new root file system of the current process, and sets 2863 * makes new_root the new root file system of the current process, and sets
2861 * root/cwd of all processes which had them on the current root to new_root. 2864 * root/cwd of all processes which had them on the current root to new_root.
2862 * 2865 *
2863 * Restrictions: 2866 * Restrictions:
2864 * The new_root and put_old must be directories, and must not be on the 2867 * The new_root and put_old must be directories, and must not be on the
2865 * same file system as the current process root. The put_old must be 2868 * same file system as the current process root. The put_old must be
2866 * underneath new_root, i.e. adding a non-zero number of /.. to the string 2869 * underneath new_root, i.e. adding a non-zero number of /.. to the string
2867 * pointed to by put_old must yield the same directory as new_root. No other 2870 * pointed to by put_old must yield the same directory as new_root. No other
2868 * file system may be mounted on put_old. After all, new_root is a mountpoint. 2871 * file system may be mounted on put_old. After all, new_root is a mountpoint.
2869 * 2872 *
2870 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. 2873 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
2871 * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives 2874 * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
2872 * in this situation. 2875 * in this situation.
2873 * 2876 *
2874 * Notes: 2877 * Notes:
2875 * - we don't move root/cwd if they are not at the root (reason: if something 2878 * - we don't move root/cwd if they are not at the root (reason: if something
2876 * cared enough to change them, it's probably wrong to force them elsewhere) 2879 * cared enough to change them, it's probably wrong to force them elsewhere)
2877 * - it's okay to pick a root that isn't the root of a file system, e.g. 2880 * - it's okay to pick a root that isn't the root of a file system, e.g.
2878 * /nfs/my_root where /nfs is the mount point. It must be a mountpoint, 2881 * /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
2879 * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root 2882 * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
2880 * first. 2883 * first.
2881 */ 2884 */
2882 SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, 2885 SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2883 const char __user *, put_old) 2886 const char __user *, put_old)
2884 { 2887 {
2885 struct path new, old, parent_path, root_parent, root; 2888 struct path new, old, parent_path, root_parent, root;
2886 struct mount *new_mnt, *root_mnt, *old_mnt; 2889 struct mount *new_mnt, *root_mnt, *old_mnt;
2887 struct mountpoint *old_mp, *root_mp; 2890 struct mountpoint *old_mp, *root_mp;
2888 int error; 2891 int error;
2889 2892
2890 if (!may_mount()) 2893 if (!may_mount())
2891 return -EPERM; 2894 return -EPERM;
2892 2895
2893 error = user_path_dir(new_root, &new); 2896 error = user_path_dir(new_root, &new);
2894 if (error) 2897 if (error)
2895 goto out0; 2898 goto out0;
2896 2899
2897 error = user_path_dir(put_old, &old); 2900 error = user_path_dir(put_old, &old);
2898 if (error) 2901 if (error)
2899 goto out1; 2902 goto out1;
2900 2903
2901 error = security_sb_pivotroot(&old, &new); 2904 error = security_sb_pivotroot(&old, &new);
2902 if (error) 2905 if (error)
2903 goto out2; 2906 goto out2;
2904 2907
2905 get_fs_root(current->fs, &root); 2908 get_fs_root(current->fs, &root);
2906 old_mp = lock_mount(&old); 2909 old_mp = lock_mount(&old);
2907 error = PTR_ERR(old_mp); 2910 error = PTR_ERR(old_mp);
2908 if (IS_ERR(old_mp)) 2911 if (IS_ERR(old_mp))
2909 goto out3; 2912 goto out3;
2910 2913
2911 error = -EINVAL; 2914 error = -EINVAL;
2912 new_mnt = real_mount(new.mnt); 2915 new_mnt = real_mount(new.mnt);
2913 root_mnt = real_mount(root.mnt); 2916 root_mnt = real_mount(root.mnt);
2914 old_mnt = real_mount(old.mnt); 2917 old_mnt = real_mount(old.mnt);
2915 if (IS_MNT_SHARED(old_mnt) || 2918 if (IS_MNT_SHARED(old_mnt) ||
2916 IS_MNT_SHARED(new_mnt->mnt_parent) || 2919 IS_MNT_SHARED(new_mnt->mnt_parent) ||
2917 IS_MNT_SHARED(root_mnt->mnt_parent)) 2920 IS_MNT_SHARED(root_mnt->mnt_parent))
2918 goto out4; 2921 goto out4;
2919 if (!check_mnt(root_mnt) || !check_mnt(new_mnt)) 2922 if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
2920 goto out4; 2923 goto out4;
2921 if (new_mnt->mnt.mnt_flags & MNT_LOCKED) 2924 if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
2922 goto out4; 2925 goto out4;
2923 error = -ENOENT; 2926 error = -ENOENT;
2924 if (d_unlinked(new.dentry)) 2927 if (d_unlinked(new.dentry))
2925 goto out4; 2928 goto out4;
2926 error = -EBUSY; 2929 error = -EBUSY;
2927 if (new_mnt == root_mnt || old_mnt == root_mnt) 2930 if (new_mnt == root_mnt || old_mnt == root_mnt)
2928 goto out4; /* loop, on the same file system */ 2931 goto out4; /* loop, on the same file system */
2929 error = -EINVAL; 2932 error = -EINVAL;
2930 if (root.mnt->mnt_root != root.dentry) 2933 if (root.mnt->mnt_root != root.dentry)
2931 goto out4; /* not a mountpoint */ 2934 goto out4; /* not a mountpoint */
2932 if (!mnt_has_parent(root_mnt)) 2935 if (!mnt_has_parent(root_mnt))
2933 goto out4; /* not attached */ 2936 goto out4; /* not attached */
2934 root_mp = root_mnt->mnt_mp; 2937 root_mp = root_mnt->mnt_mp;
2935 if (new.mnt->mnt_root != new.dentry) 2938 if (new.mnt->mnt_root != new.dentry)
2936 goto out4; /* not a mountpoint */ 2939 goto out4; /* not a mountpoint */
2937 if (!mnt_has_parent(new_mnt)) 2940 if (!mnt_has_parent(new_mnt))
2938 goto out4; /* not attached */ 2941 goto out4; /* not attached */
2939 /* make sure we can reach put_old from new_root */ 2942 /* make sure we can reach put_old from new_root */
2940 if (!is_path_reachable(old_mnt, old.dentry, &new)) 2943 if (!is_path_reachable(old_mnt, old.dentry, &new))
2941 goto out4; 2944 goto out4;
2942 /* make certain new is below the root */ 2945 /* make certain new is below the root */
2943 if (!is_path_reachable(new_mnt, new.dentry, &root)) 2946 if (!is_path_reachable(new_mnt, new.dentry, &root))
2944 goto out4; 2947 goto out4;
2945 root_mp->m_count++; /* pin it so it won't go away */ 2948 root_mp->m_count++; /* pin it so it won't go away */
2946 lock_mount_hash(); 2949 lock_mount_hash();
2947 detach_mnt(new_mnt, &parent_path); 2950 detach_mnt(new_mnt, &parent_path);
2948 detach_mnt(root_mnt, &root_parent); 2951 detach_mnt(root_mnt, &root_parent);
2949 if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { 2952 if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
2950 new_mnt->mnt.mnt_flags |= MNT_LOCKED; 2953 new_mnt->mnt.mnt_flags |= MNT_LOCKED;
2951 root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; 2954 root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
2952 } 2955 }
2953 /* mount old root on put_old */ 2956 /* mount old root on put_old */
2954 attach_mnt(root_mnt, old_mnt, old_mp); 2957 attach_mnt(root_mnt, old_mnt, old_mp);
2955 /* mount new_root on / */ 2958 /* mount new_root on / */
2956 attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp); 2959 attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp);
2957 touch_mnt_namespace(current->nsproxy->mnt_ns); 2960 touch_mnt_namespace(current->nsproxy->mnt_ns);
2958 unlock_mount_hash(); 2961 unlock_mount_hash();
2959 chroot_fs_refs(&root, &new); 2962 chroot_fs_refs(&root, &new);
2960 put_mountpoint(root_mp); 2963 put_mountpoint(root_mp);
2961 error = 0; 2964 error = 0;
2962 out4: 2965 out4:
2963 unlock_mount(old_mp); 2966 unlock_mount(old_mp);
2964 if (!error) { 2967 if (!error) {
2965 path_put(&root_parent); 2968 path_put(&root_parent);
2966 path_put(&parent_path); 2969 path_put(&parent_path);
2967 } 2970 }
2968 out3: 2971 out3:
2969 path_put(&root); 2972 path_put(&root);
2970 out2: 2973 out2:
2971 path_put(&old); 2974 path_put(&old);
2972 out1: 2975 out1:
2973 path_put(&new); 2976 path_put(&new);
2974 out0: 2977 out0:
2975 return error; 2978 return error;
2976 } 2979 }
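The userspace sequence that satisfies the checks above: unshare the mount namespace, make new_root a mount point, and keep put_old underneath it. A hedged sketch (no glibc wrapper exists, so the raw syscall is used; paths are hypothetical, the directories are assumed to exist, and CAP_SYS_ADMIN is required):

        #define _GNU_SOURCE
        #include <sched.h>
        #include <stdio.h>
        #include <sys/mount.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        int main(void)
        {
                if (unshare(CLONE_NEWNS) == -1)
                        return perror("unshare"), 1;
                /* new_root must itself be a mount point; bind it over itself */
                if (mount("/tmp/newroot", "/tmp/newroot", NULL, MS_BIND, NULL) == -1)
                        return perror("bind"), 1;
                /* put_old must be reachable from new_root */
                if (syscall(SYS_pivot_root, "/tmp/newroot", "/tmp/newroot/old") == -1)
                        return perror("pivot_root"), 1;
                chdir("/");     /* step onto the new root */
                return 0;
        }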
2977 2980
2978 static void __init init_mount_tree(void) 2981 static void __init init_mount_tree(void)
2979 { 2982 {
2980 struct vfsmount *mnt; 2983 struct vfsmount *mnt;
2981 struct mnt_namespace *ns; 2984 struct mnt_namespace *ns;
2982 struct path root; 2985 struct path root;
2983 struct file_system_type *type; 2986 struct file_system_type *type;
2984 2987
2985 type = get_fs_type("rootfs"); 2988 type = get_fs_type("rootfs");
2986 if (!type) 2989 if (!type)
2987 panic("Can't find rootfs type"); 2990 panic("Can't find rootfs type");
2988 mnt = vfs_kern_mount(type, 0, "rootfs", NULL); 2991 mnt = vfs_kern_mount(type, 0, "rootfs", NULL);
2989 put_filesystem(type); 2992 put_filesystem(type);
2990 if (IS_ERR(mnt)) 2993 if (IS_ERR(mnt))
2991 panic("Can't create rootfs"); 2994 panic("Can't create rootfs");
2992 2995
2993 ns = create_mnt_ns(mnt); 2996 ns = create_mnt_ns(mnt);
2994 if (IS_ERR(ns)) 2997 if (IS_ERR(ns))
2995 panic("Can't allocate initial namespace"); 2998 panic("Can't allocate initial namespace");
2996 2999
2997 init_task.nsproxy->mnt_ns = ns; 3000 init_task.nsproxy->mnt_ns = ns;
2998 get_mnt_ns(ns); 3001 get_mnt_ns(ns);
2999 3002
3000 root.mnt = mnt; 3003 root.mnt = mnt;
3001 root.dentry = mnt->mnt_root; 3004 root.dentry = mnt->mnt_root;
3002 3005
3003 set_fs_pwd(current->fs, &root); 3006 set_fs_pwd(current->fs, &root);
3004 set_fs_root(current->fs, &root); 3007 set_fs_root(current->fs, &root);
3005 } 3008 }
3006 3009
3007 void __init mnt_init(void) 3010 void __init mnt_init(void)
3008 { 3011 {
3009 unsigned u; 3012 unsigned u;
3010 int err; 3013 int err;
3011 3014
3012 mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), 3015 mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
3013 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); 3016 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
3014 3017
3015 mount_hashtable = alloc_large_system_hash("Mount-cache", 3018 mount_hashtable = alloc_large_system_hash("Mount-cache",
3016 sizeof(struct hlist_head), 3019 sizeof(struct hlist_head),
3017 mhash_entries, 19, 3020 mhash_entries, 19,
3018 0, 3021 0,
3019 &m_hash_shift, &m_hash_mask, 0, 0); 3022 &m_hash_shift, &m_hash_mask, 0, 0);
3020 mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache", 3023 mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
3021 sizeof(struct hlist_head), 3024 sizeof(struct hlist_head),
3022 mphash_entries, 19, 3025 mphash_entries, 19,
3023 0, 3026 0,
3024 &mp_hash_shift, &mp_hash_mask, 0, 0); 3027 &mp_hash_shift, &mp_hash_mask, 0, 0);
3025 3028
3026 if (!mount_hashtable || !mountpoint_hashtable) 3029 if (!mount_hashtable || !mountpoint_hashtable)
3027 panic("Failed to allocate mount hash table\n"); 3030 panic("Failed to allocate mount hash table\n");
3028 3031
3029 for (u = 0; u <= m_hash_mask; u++) 3032 for (u = 0; u <= m_hash_mask; u++)
3030 INIT_HLIST_HEAD(&mount_hashtable[u]); 3033 INIT_HLIST_HEAD(&mount_hashtable[u]);
3031 for (u = 0; u <= mp_hash_mask; u++) 3034 for (u = 0; u <= mp_hash_mask; u++)
3032 INIT_HLIST_HEAD(&mountpoint_hashtable[u]); 3035 INIT_HLIST_HEAD(&mountpoint_hashtable[u]);
3033 3036
3034 kernfs_init(); 3037 kernfs_init();
3035 3038
3036 err = sysfs_init(); 3039 err = sysfs_init();
3037 if (err) 3040 if (err)
3038 printk(KERN_WARNING "%s: sysfs_init error: %d\n", 3041 printk(KERN_WARNING "%s: sysfs_init error: %d\n",
3039 __func__, err); 3042 __func__, err);
3040 fs_kobj = kobject_create_and_add("fs", NULL); 3043 fs_kobj = kobject_create_and_add("fs", NULL);
3041 if (!fs_kobj) 3044 if (!fs_kobj)
3042 printk(KERN_WARNING "%s: kobj create error\n", __func__); 3045 printk(KERN_WARNING "%s: kobj create error\n", __func__);
3043 init_rootfs(); 3046 init_rootfs();
3044 init_mount_tree(); 3047 init_mount_tree();
3045 } 3048 }
3046 3049
3047 void put_mnt_ns(struct mnt_namespace *ns) 3050 void put_mnt_ns(struct mnt_namespace *ns)
3048 { 3051 {
3049 if (!atomic_dec_and_test(&ns->count)) 3052 if (!atomic_dec_and_test(&ns->count))
3050 return; 3053 return;
3051 drop_collected_mounts(&ns->root->mnt); 3054 drop_collected_mounts(&ns->root->mnt);
3052 free_mnt_ns(ns); 3055 free_mnt_ns(ns);
3053 } 3056 }
3054 3057
3055 struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) 3058 struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
3056 { 3059 {
3057 struct vfsmount *mnt; 3060 struct vfsmount *mnt;
3058 mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data); 3061 mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data);
3059 if (!IS_ERR(mnt)) { 3062 if (!IS_ERR(mnt)) {
3060 /* 3063 /*
3061 * it is a long-term mount; don't release mnt until 3064 * it is a long-term mount; don't release mnt until
3062 * we unmount, just before the filesystem is unregistered 3065 * we unmount, just before the filesystem is unregistered
3063 */ 3066 */
3064 real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; 3067 real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
3065 } 3068 }
3066 return mnt; 3069 return mnt;
3067 } 3070 }
3068 EXPORT_SYMBOL_GPL(kern_mount_data); 3071 EXPORT_SYMBOL_GPL(kern_mount_data);
3069 3072
3070 void kern_unmount(struct vfsmount *mnt) 3073 void kern_unmount(struct vfsmount *mnt)
3071 { 3074 {
3072 /* release long term mount so mount point can be released */ 3075 /* release long term mount so mount point can be released */
3073 if (!IS_ERR_OR_NULL(mnt)) { 3076 if (!IS_ERR_OR_NULL(mnt)) {
3074 real_mount(mnt)->mnt_ns = NULL; 3077 real_mount(mnt)->mnt_ns = NULL;
3075 synchronize_rcu(); /* yecchhh... */ 3078 synchronize_rcu(); /* yecchhh... */
3076 mntput(mnt); 3079 mntput(mnt);
3077 } 3080 }
3078 } 3081 }
3079 EXPORT_SYMBOL(kern_unmount); 3082 EXPORT_SYMBOL(kern_unmount);
3080 3083
3081 bool our_mnt(struct vfsmount *mnt) 3084 bool our_mnt(struct vfsmount *mnt)
3082 { 3085 {
3083 return check_mnt(real_mount(mnt)); 3086 return check_mnt(real_mount(mnt));
3084 } 3087 }
3085 3088
3086 bool current_chrooted(void) 3089 bool current_chrooted(void)
3087 { 3090 {
3088 /* Does the current process have a non-standard root */ 3091 /* Does the current process have a non-standard root */
3089 struct path ns_root; 3092 struct path ns_root;
3090 struct path fs_root; 3093 struct path fs_root;
3091 bool chrooted; 3094 bool chrooted;
3092 3095
3093 /* Find the namespace root */ 3096 /* Find the namespace root */
3094 ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt; 3097 ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
3095 ns_root.dentry = ns_root.mnt->mnt_root; 3098 ns_root.dentry = ns_root.mnt->mnt_root;
3096 path_get(&ns_root); 3099 path_get(&ns_root);
3097 while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) 3100 while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
3098 ; 3101 ;
3099 3102
3100 get_fs_root(current->fs, &fs_root); 3103 get_fs_root(current->fs, &fs_root);
3101 3104
3102 chrooted = !path_equal(&fs_root, &ns_root); 3105 chrooted = !path_equal(&fs_root, &ns_root);
3103 3106
3104 path_put(&fs_root); 3107 path_put(&fs_root);
3105 path_put(&ns_root); 3108 path_put(&ns_root);
3106 3109
3107 return chrooted; 3110 return chrooted;
3108 } 3111 }
3109 3112
3110 bool fs_fully_visible(struct file_system_type *type) 3113 bool fs_fully_visible(struct file_system_type *type)
3111 { 3114 {
3112 struct mnt_namespace *ns = current->nsproxy->mnt_ns; 3115 struct mnt_namespace *ns = current->nsproxy->mnt_ns;
3113 struct mount *mnt; 3116 struct mount *mnt;
3114 bool visible = false; 3117 bool visible = false;
3115 3118
3116 if (unlikely(!ns)) 3119 if (unlikely(!ns))
3117 return false; 3120 return false;
3118 3121
3119 down_read(&namespace_sem); 3122 down_read(&namespace_sem);
3120 list_for_each_entry(mnt, &ns->list, mnt_list) { 3123 list_for_each_entry(mnt, &ns->list, mnt_list) {
3121 struct mount *child; 3124 struct mount *child;
3122 if (mnt->mnt.mnt_sb->s_type != type) 3125 if (mnt->mnt.mnt_sb->s_type != type)
3123 continue; 3126 continue;
3124 3127
3125 /* This mount is not fully visible if there are any child mounts 3128 /* This mount is not fully visible if there are any child mounts
3126 * that cover anything except for empty directories. 3129 * that cover anything except for empty directories.
3127 */ 3130 */
3128 list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { 3131 list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
3129 struct inode *inode = child->mnt_mountpoint->d_inode; 3132 struct inode *inode = child->mnt_mountpoint->d_inode;
3130 if (!S_ISDIR(inode->i_mode)) 3133 if (!S_ISDIR(inode->i_mode))
3131 goto next; 3134 goto next;
3132 if (inode->i_nlink > 2) 3135 if (inode->i_nlink > 2)
3133 goto next; 3136 goto next;
3134 } 3137 }
3135 visible = true; 3138 visible = true;
3136 goto found; 3139 goto found;
3137 next: ; 3140 next: ;
3138 } 3141 }
3139 found: 3142 found:
3140 up_read(&namespace_sem); 3143 up_read(&namespace_sem);
3141 return visible; 3144 return visible;
3142 } 3145 }
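The i_nlink > 2 test leans on the classic directory link-count convention: an empty directory has exactly two links, "." and the entry in its parent, so a higher count implies subdirectories and therefore content hidden by the child mount. The same heuristic from userspace (hedged; filesystems that don't maintain per-subdirectory link counts defeat it):

        #include <stdio.h>
        #include <sys/stat.h>

        int main(void)
        {
                struct stat st;                 /* path is hypothetical */
                if (stat("/mnt/point", &st) == 0)
                        printf("possibly empty: %s\n",
                               st.st_nlink == 2 ? "yes" : "no");
                return 0;
        }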
3143 3146
3144 static struct ns_common *mntns_get(struct task_struct *task) 3147 static struct ns_common *mntns_get(struct task_struct *task)
3145 { 3148 {
3146 struct ns_common *ns = NULL; 3149 struct ns_common *ns = NULL;
3147 struct nsproxy *nsproxy; 3150 struct nsproxy *nsproxy;
3148 3151
3149 task_lock(task); 3152 task_lock(task);
3150 nsproxy = task->nsproxy; 3153 nsproxy = task->nsproxy;
3151 if (nsproxy) { 3154 if (nsproxy) {
3152 ns = &nsproxy->mnt_ns->ns; 3155 ns = &nsproxy->mnt_ns->ns;
3153 get_mnt_ns(to_mnt_ns(ns)); 3156 get_mnt_ns(to_mnt_ns(ns));
3154 } 3157 }
3155 task_unlock(task); 3158 task_unlock(task);
3156 3159
3157 return ns; 3160 return ns;
3158 } 3161 }
3159 3162
3160 static void mntns_put(struct ns_common *ns) 3163 static void mntns_put(struct ns_common *ns)
3161 { 3164 {
3162 put_mnt_ns(to_mnt_ns(ns)); 3165 put_mnt_ns(to_mnt_ns(ns));
3163 } 3166 }
3164 3167
3165 static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) 3168 static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns)
3166 { 3169 {
3167 struct fs_struct *fs = current->fs; 3170 struct fs_struct *fs = current->fs;
3168 struct mnt_namespace *mnt_ns = to_mnt_ns(ns); 3171 struct mnt_namespace *mnt_ns = to_mnt_ns(ns);
3169 struct path root; 3172 struct path root;
3170 3173
3171 if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || 3174 if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
3172 !ns_capable(current_user_ns(), CAP_SYS_CHROOT) || 3175 !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
3173 !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) 3176 !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
3174 return -EPERM; 3177 return -EPERM;
3175 3178
3176 if (fs->users != 1) 3179 if (fs->users != 1)
3177 return -EINVAL; 3180 return -EINVAL;
3178 3181
3179 get_mnt_ns(mnt_ns); 3182 get_mnt_ns(mnt_ns);
3180 put_mnt_ns(nsproxy->mnt_ns); 3183 put_mnt_ns(nsproxy->mnt_ns);
3181 nsproxy->mnt_ns = mnt_ns; 3184 nsproxy->mnt_ns = mnt_ns;
3182 3185
3183 /* Find the root */ 3186 /* Find the root */
3184 root.mnt = &mnt_ns->root->mnt; 3187 root.mnt = &mnt_ns->root->mnt;
3185 root.dentry = mnt_ns->root->mnt.mnt_root; 3188 root.dentry = mnt_ns->root->mnt.mnt_root;
3186 path_get(&root); 3189 path_get(&root);
3187 while (d_mountpoint(root.dentry) && follow_down_one(&root)) 3190 while (d_mountpoint(root.dentry) && follow_down_one(&root))
3188 ; 3191 ;
3189 3192
3190 /* Update the pwd and root */ 3193 /* Update the pwd and root */
3191 set_fs_pwd(fs, &root); 3194 set_fs_pwd(fs, &root);
3192 set_fs_root(fs, &root); 3195 set_fs_root(fs, &root);
3193 3196
3194 path_put(&root); 3197 path_put(&root);
3195 return 0; 3198 return 0;
3196 } 3199 }
3197 3200
3198 const struct proc_ns_operations mntns_operations = { 3201 const struct proc_ns_operations mntns_operations = {
3199 .name = "mnt", 3202 .name = "mnt",
3200 .type = CLONE_NEWNS, 3203 .type = CLONE_NEWNS,
3201 .get = mntns_get, 3204 .get = mntns_get,
3202 .put = mntns_put, 3205 .put = mntns_put,
3203 .install = mntns_install, 3206 .install = mntns_install,
3204 }; 3207 };
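mntns_install() is reached through setns(2) on a mount-namespace fd; the triple capability check and the fs->users == 1 test above are why a multithreaded process (which shares its fs_struct) cannot switch mount namespaces. A minimal, hedged userspace counterpart (argument checking omitted):

        #define _GNU_SOURCE
        #include <fcntl.h>
        #include <sched.h>
        #include <stdio.h>

        int main(int argc, char **argv)
        {
                int fd = open(argv[1], O_RDONLY);       /* e.g. /proc/<pid>/ns/mnt */
                if (fd < 0)
                        return perror("open"), 1;
                if (setns(fd, CLONE_NEWNS) == -1)       /* lands in mntns_install() */
                        return perror("setns"), 1;
                return 0;
        }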
3205 3208
File was created 1 #include <linux/mount.h>
2 #include <linux/file.h>
3 #include <linux/fs.h>
4 #include <linux/proc_ns.h>
5 #include <linux/magic.h>
6 #include <linux/ktime.h>
7
8 static struct vfsmount *nsfs_mnt;
9
10 static const struct file_operations ns_file_operations = {
11 .llseek = no_llseek,
12 };
13
14 static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
15 {
16 struct inode *inode = dentry->d_inode;
17 const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
18
19 return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]",
20 ns_ops->name, inode->i_ino);
21 }
22
23 static void ns_prune_dentry(struct dentry *dentry)
24 {
25 struct inode *inode = dentry->d_inode;
26 if (inode) {
27 struct ns_common *ns = inode->i_private;
28 atomic_long_set(&ns->stashed, 0);
29 }
30 }
31
32 const struct dentry_operations ns_dentry_operations =
33 {
34 .d_prune = ns_prune_dentry,
35 .d_delete = always_delete_dentry,
36 .d_dname = ns_dname,
37 };
38
39 static void nsfs_evict(struct inode *inode)
40 {
41 struct ns_common *ns = inode->i_private;
42 clear_inode(inode);
43 ns->ops->put(ns);
44 }
45
46 void *ns_get_path(struct path *path, struct task_struct *task,
47 const struct proc_ns_operations *ns_ops)
48 {
49 struct vfsmount *mnt = mntget(nsfs_mnt);
50 struct qstr qname = { .name = "", };
51 struct dentry *dentry;
52 struct inode *inode;
53 struct ns_common *ns;
54 unsigned long d;
55
56 again:
57 ns = ns_ops->get(task);
58 if (!ns) {
59 mntput(mnt);
60 return ERR_PTR(-ENOENT);
61 }
62 rcu_read_lock();
63 d = atomic_long_read(&ns->stashed);
64 if (!d)
65 goto slow;
66 dentry = (struct dentry *)d;
67 if (!lockref_get_not_dead(&dentry->d_lockref))
68 goto slow;
69 rcu_read_unlock();
70 ns_ops->put(ns);
71 got_it:
72 path->mnt = mnt;
73 path->dentry = dentry;
74 return NULL;
75 slow:
76 rcu_read_unlock();
77 inode = new_inode_pseudo(mnt->mnt_sb);
78 if (!inode) {
79 ns_ops->put(ns);
80 mntput(mnt);
81 return ERR_PTR(-ENOMEM);
82 }
83 inode->i_ino = ns->inum;
84 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
85 inode->i_flags |= S_IMMUTABLE;
86 inode->i_mode = S_IFREG | S_IRUGO;
87 inode->i_fop = &ns_file_operations;
88 inode->i_private = ns;
89
90 dentry = d_alloc_pseudo(mnt->mnt_sb, &qname);
91 if (!dentry) {
92 iput(inode);
93 mntput(mnt);
94 return ERR_PTR(-ENOMEM);
95 }
96 d_instantiate(dentry, inode);
97 dentry->d_fsdata = (void *)ns_ops;
98 d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry);
99 if (d) {
100 d_delete(dentry); /* make sure ->d_prune() does nothing */
101 dput(dentry);
102 cpu_relax();
103 goto again;
104 }
105 goto got_it;
106 }
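ns->stashed is a one-slot lockless cache: the fast path grabs the stashed dentry under RCU with lockref_get_not_dead(), and the slow path publishes a freshly built dentry with cmpxchg, dropping its own copy and retrying if another CPU won the race. A hedged userspace analogue of the publish step, in C11 atomics:

        #include <stdatomic.h>

        static _Atomic(void *) slot;    /* plays the role of ns->stashed */

        /* Publish 'mine' unless something is already stashed; return whatever
         * ends up in the slot. A loser must drop its own copy, as the code
         * above does with d_delete()/dput() before looping to 'again'. */
        void *get_or_stash(void *mine)
        {
                void *expected = NULL;
                if (atomic_compare_exchange_strong(&slot, &expected, mine))
                        return mine;
                return expected;
        }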
107
108 int ns_get_name(char *buf, size_t size, struct task_struct *task,
109 const struct proc_ns_operations *ns_ops)
110 {
111 struct ns_common *ns;
112 int res = -ENOENT;
113 ns = ns_ops->get(task);
114 if (ns) {
115 res = snprintf(buf, size, "%s:[%u]", ns_ops->name, ns->inum);
116 ns_ops->put(ns);
117 }
118 return res;
119 }
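ns_get_name() produces the "name:[inum]" text reported for the /proc/*/ns/* links, matching what ns_dname() generates for the nsfs dentries themselves. Observed from userspace:

        #include <limits.h>
        #include <stdio.h>
        #include <unistd.h>

        int main(void)
        {
                char buf[PATH_MAX];
                ssize_t n = readlink("/proc/self/ns/mnt", buf, sizeof(buf) - 1);
                if (n < 0)
                        return perror("readlink"), 1;
                buf[n] = '\0';
                printf("%s\n", buf);    /* e.g. "mnt:[4026531840]" */
                return 0;
        }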
120
121 struct file *proc_ns_fget(int fd)
122 {
123 struct file *file;
124
125 file = fget(fd);
126 if (!file)
127 return ERR_PTR(-EBADF);
128
129 if (file->f_op != &ns_file_operations)
130 goto out_invalid;
131
132 return file;
133
134 out_invalid:
135 fput(file);
136 return ERR_PTR(-EINVAL);
137 }
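proc_ns_fget() is the validation helper for setns(2): any fd whose file isn't backed by ns_file_operations is rejected before the per-type ->install() hook runs. A hedged kernel-context fragment of the caller side:

        struct file *file = proc_ns_fget(fd);
        if (IS_ERR(file))
                return PTR_ERR(file);   /* -EBADF or -EINVAL from above */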
138
139 static const struct super_operations nsfs_ops = {
140 .statfs = simple_statfs,
141 .evict_inode = nsfs_evict,
142 };
143 static struct dentry *nsfs_mount(struct file_system_type *fs_type,
144 int flags, const char *dev_name, void *data)
145 {
146 return mount_pseudo(fs_type, "nsfs:", &nsfs_ops,
147 &ns_dentry_operations, NSFS_MAGIC);
148 }
149 static struct file_system_type nsfs = {
150 .name = "nsfs",
151 .mount = nsfs_mount,
152 .kill_sb = kill_anon_super,
153 };
154
155 void __init nsfs_init(void)
156 {
157 nsfs_mnt = kern_mount(&nsfs);
158 if (IS_ERR(nsfs_mnt))
159 panic("can't set nsfs up\n");
160 nsfs_mnt->mnt_sb->s_flags &= ~MS_NOUSER;
161 }
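Clearing MS_NOUSER is what makes these files attachable from userspace, so a namespace can be pinned by bind-mounting its nsfs file even after its last process exits. A hedged illustration (pid and paths hypothetical; the target file must already exist; needs privilege):

        #include <stdio.h>
        #include <sys/mount.h>

        int main(void)
        {
                if (mount("/proc/1234/ns/net", "/run/netns/pinned",
                          NULL, MS_BIND, NULL) == -1)
                        perror("mount");
                return 0;
        }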
162
1 /* 1 /*
2 * linux/fs/proc/inode.c 2 * linux/fs/proc/inode.c
3 * 3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */ 5 */
6 6
7 #include <linux/time.h> 7 #include <linux/time.h>
8 #include <linux/proc_fs.h> 8 #include <linux/proc_fs.h>
9 #include <linux/kernel.h> 9 #include <linux/kernel.h>
10 #include <linux/pid_namespace.h> 10 #include <linux/pid_namespace.h>
11 #include <linux/mm.h> 11 #include <linux/mm.h>
12 #include <linux/string.h> 12 #include <linux/string.h>
13 #include <linux/stat.h> 13 #include <linux/stat.h>
14 #include <linux/completion.h> 14 #include <linux/completion.h>
15 #include <linux/poll.h> 15 #include <linux/poll.h>
16 #include <linux/printk.h> 16 #include <linux/printk.h>
17 #include <linux/file.h> 17 #include <linux/file.h>
18 #include <linux/limits.h> 18 #include <linux/limits.h>
19 #include <linux/init.h> 19 #include <linux/init.h>
20 #include <linux/module.h> 20 #include <linux/module.h>
21 #include <linux/sysctl.h> 21 #include <linux/sysctl.h>
22 #include <linux/seq_file.h> 22 #include <linux/seq_file.h>
23 #include <linux/slab.h> 23 #include <linux/slab.h>
24 #include <linux/mount.h> 24 #include <linux/mount.h>
25 #include <linux/magic.h> 25 #include <linux/magic.h>
26 26
27 #include <asm/uaccess.h> 27 #include <asm/uaccess.h>
28 28
29 #include "internal.h" 29 #include "internal.h"
30 30
31 static void proc_evict_inode(struct inode *inode) 31 static void proc_evict_inode(struct inode *inode)
32 { 32 {
33 struct proc_dir_entry *de; 33 struct proc_dir_entry *de;
34 struct ctl_table_header *head; 34 struct ctl_table_header *head;
35 struct ns_common *ns;
36 35
37 truncate_inode_pages_final(&inode->i_data); 36 truncate_inode_pages_final(&inode->i_data);
38 clear_inode(inode); 37 clear_inode(inode);
39 38
40 /* Stop tracking associated processes */ 39 /* Stop tracking associated processes */
41 put_pid(PROC_I(inode)->pid); 40 put_pid(PROC_I(inode)->pid);
42 41
43 /* Let go of any associated proc directory entry */ 42 /* Let go of any associated proc directory entry */
44 de = PROC_I(inode)->pde; 43 de = PROC_I(inode)->pde;
45 if (de) 44 if (de)
46 pde_put(de); 45 pde_put(de);
47 head = PROC_I(inode)->sysctl; 46 head = PROC_I(inode)->sysctl;
48 if (head) { 47 if (head) {
49 RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL); 48 RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
50 sysctl_head_put(head); 49 sysctl_head_put(head);
51 } 50 }
52 /* Release any associated namespace */
53 ns = PROC_I(inode)->ns.ns;
54 if (ns && ns->ops)
55 ns->ops->put(ns);
56 } 51 }
57 52
58 static struct kmem_cache * proc_inode_cachep; 53 static struct kmem_cache * proc_inode_cachep;
59 54
60 static struct inode *proc_alloc_inode(struct super_block *sb) 55 static struct inode *proc_alloc_inode(struct super_block *sb)
61 { 56 {
62 struct proc_inode *ei; 57 struct proc_inode *ei;
63 struct inode *inode; 58 struct inode *inode;
64 59
65 ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); 60 ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL);
66 if (!ei) 61 if (!ei)
67 return NULL; 62 return NULL;
68 ei->pid = NULL; 63 ei->pid = NULL;
69 ei->fd = 0; 64 ei->fd = 0;
70 ei->op.proc_get_link = NULL; 65 ei->op.proc_get_link = NULL;
71 ei->pde = NULL; 66 ei->pde = NULL;
72 ei->sysctl = NULL; 67 ei->sysctl = NULL;
73 ei->sysctl_entry = NULL; 68 ei->sysctl_entry = NULL;
74 ei->ns.ns = NULL; 69 ei->ns.ns = NULL;
75 ei->ns.ns_ops = NULL; 70 ei->ns.ns_ops = NULL;
76 inode = &ei->vfs_inode; 71 inode = &ei->vfs_inode;
77 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 72 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
78 return inode; 73 return inode;
79 } 74 }
80 75
81 static void proc_i_callback(struct rcu_head *head) 76 static void proc_i_callback(struct rcu_head *head)
82 { 77 {
83 struct inode *inode = container_of(head, struct inode, i_rcu); 78 struct inode *inode = container_of(head, struct inode, i_rcu);
84 kmem_cache_free(proc_inode_cachep, PROC_I(inode)); 79 kmem_cache_free(proc_inode_cachep, PROC_I(inode));
85 } 80 }
86 81
87 static void proc_destroy_inode(struct inode *inode) 82 static void proc_destroy_inode(struct inode *inode)
88 { 83 {
89 call_rcu(&inode->i_rcu, proc_i_callback); 84 call_rcu(&inode->i_rcu, proc_i_callback);
90 } 85 }
91 86
92 static void init_once(void *foo) 87 static void init_once(void *foo)
93 { 88 {
94 struct proc_inode *ei = (struct proc_inode *) foo; 89 struct proc_inode *ei = (struct proc_inode *) foo;
95 90
96 inode_init_once(&ei->vfs_inode); 91 inode_init_once(&ei->vfs_inode);
97 } 92 }
98 93
99 void __init proc_init_inodecache(void) 94 void __init proc_init_inodecache(void)
100 { 95 {
101 proc_inode_cachep = kmem_cache_create("proc_inode_cache", 96 proc_inode_cachep = kmem_cache_create("proc_inode_cache",
102 sizeof(struct proc_inode), 97 sizeof(struct proc_inode),
103 0, (SLAB_RECLAIM_ACCOUNT| 98 0, (SLAB_RECLAIM_ACCOUNT|
104 SLAB_MEM_SPREAD|SLAB_PANIC), 99 SLAB_MEM_SPREAD|SLAB_PANIC),
105 init_once); 100 init_once);
106 } 101 }
107 102
108 static int proc_show_options(struct seq_file *seq, struct dentry *root) 103 static int proc_show_options(struct seq_file *seq, struct dentry *root)
109 { 104 {
110 struct super_block *sb = root->d_sb; 105 struct super_block *sb = root->d_sb;
111 struct pid_namespace *pid = sb->s_fs_info; 106 struct pid_namespace *pid = sb->s_fs_info;
112 107
113 if (!gid_eq(pid->pid_gid, GLOBAL_ROOT_GID)) 108 if (!gid_eq(pid->pid_gid, GLOBAL_ROOT_GID))
114 seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid)); 109 seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid));
115 if (pid->hide_pid != 0) 110 if (pid->hide_pid != 0)
116 seq_printf(seq, ",hidepid=%u", pid->hide_pid); 111 seq_printf(seq, ",hidepid=%u", pid->hide_pid);
117 112
118 return 0; 113 return 0;
119 } 114 }
120 115
121 static const struct super_operations proc_sops = { 116 static const struct super_operations proc_sops = {
122 .alloc_inode = proc_alloc_inode, 117 .alloc_inode = proc_alloc_inode,
123 .destroy_inode = proc_destroy_inode, 118 .destroy_inode = proc_destroy_inode,
124 .drop_inode = generic_delete_inode, 119 .drop_inode = generic_delete_inode,
125 .evict_inode = proc_evict_inode, 120 .evict_inode = proc_evict_inode,
126 .statfs = simple_statfs, 121 .statfs = simple_statfs,
127 .remount_fs = proc_remount, 122 .remount_fs = proc_remount,
128 .show_options = proc_show_options, 123 .show_options = proc_show_options,
129 }; 124 };
130 125
131 enum {BIAS = -1U<<31}; 126 enum {BIAS = -1U<<31};
132 127
133 static inline int use_pde(struct proc_dir_entry *pde) 128 static inline int use_pde(struct proc_dir_entry *pde)
134 { 129 {
135 return atomic_inc_unless_negative(&pde->in_use); 130 return atomic_inc_unless_negative(&pde->in_use);
136 } 131 }
137 132
138 static void unuse_pde(struct proc_dir_entry *pde) 133 static void unuse_pde(struct proc_dir_entry *pde)
139 { 134 {
140 if (atomic_dec_return(&pde->in_use) == BIAS) 135 if (atomic_dec_return(&pde->in_use) == BIAS)
141 complete(pde->pde_unload_completion); 136 complete(pde->pde_unload_completion);
142 } 137 }
143 138
144 /* pde is locked */ 139 /* pde is locked */
145 static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) 140 static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
146 { 141 {
147 if (pdeo->closing) { 142 if (pdeo->closing) {
148 /* somebody else is doing that, just wait */ 143 /* somebody else is doing that, just wait */
149 DECLARE_COMPLETION_ONSTACK(c); 144 DECLARE_COMPLETION_ONSTACK(c);
150 pdeo->c = &c; 145 pdeo->c = &c;
151 spin_unlock(&pde->pde_unload_lock); 146 spin_unlock(&pde->pde_unload_lock);
152 wait_for_completion(&c); 147 wait_for_completion(&c);
153 spin_lock(&pde->pde_unload_lock); 148 spin_lock(&pde->pde_unload_lock);
154 } else { 149 } else {
155 struct file *file; 150 struct file *file;
156 pdeo->closing = 1; 151 pdeo->closing = 1;
157 spin_unlock(&pde->pde_unload_lock); 152 spin_unlock(&pde->pde_unload_lock);
158 file = pdeo->file; 153 file = pdeo->file;
159 pde->proc_fops->release(file_inode(file), file); 154 pde->proc_fops->release(file_inode(file), file);
160 spin_lock(&pde->pde_unload_lock); 155 spin_lock(&pde->pde_unload_lock);
161 list_del_init(&pdeo->lh); 156 list_del_init(&pdeo->lh);
162 if (pdeo->c) 157 if (pdeo->c)
163 complete(pdeo->c); 158 complete(pdeo->c);
164 kfree(pdeo); 159 kfree(pdeo);
165 } 160 }
166 } 161 }
167 162
168 void proc_entry_rundown(struct proc_dir_entry *de) 163 void proc_entry_rundown(struct proc_dir_entry *de)
169 { 164 {
170 DECLARE_COMPLETION_ONSTACK(c); 165 DECLARE_COMPLETION_ONSTACK(c);
171 /* Wait until all existing callers into module are done. */ 166 /* Wait until all existing callers into module are done. */
172 de->pde_unload_completion = &c; 167 de->pde_unload_completion = &c;
173 if (atomic_add_return(BIAS, &de->in_use) != BIAS) 168 if (atomic_add_return(BIAS, &de->in_use) != BIAS)
174 wait_for_completion(&c); 169 wait_for_completion(&c);
175 170
176 spin_lock(&de->pde_unload_lock); 171 spin_lock(&de->pde_unload_lock);
177 while (!list_empty(&de->pde_openers)) { 172 while (!list_empty(&de->pde_openers)) {
178 struct pde_opener *pdeo; 173 struct pde_opener *pdeo;
179 pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); 174 pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh);
180 close_pdeo(de, pdeo); 175 close_pdeo(de, pdeo);
181 } 176 }
182 spin_unlock(&de->pde_unload_lock); 177 spin_unlock(&de->pde_unload_lock);
183 } 178 }
184 179
185 static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) 180 static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
186 { 181 {
187 struct proc_dir_entry *pde = PDE(file_inode(file)); 182 struct proc_dir_entry *pde = PDE(file_inode(file));
188 loff_t rv = -EINVAL; 183 loff_t rv = -EINVAL;
189 if (use_pde(pde)) { 184 if (use_pde(pde)) {
190 loff_t (*llseek)(struct file *, loff_t, int); 185 loff_t (*llseek)(struct file *, loff_t, int);
191 llseek = pde->proc_fops->llseek; 186 llseek = pde->proc_fops->llseek;
192 if (!llseek) 187 if (!llseek)
193 llseek = default_llseek; 188 llseek = default_llseek;
194 rv = llseek(file, offset, whence); 189 rv = llseek(file, offset, whence);
195 unuse_pde(pde); 190 unuse_pde(pde);
196 } 191 }
197 return rv; 192 return rv;
198 } 193 }
199 194
200 static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) 195 static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
201 { 196 {
202 ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); 197 ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
203 struct proc_dir_entry *pde = PDE(file_inode(file)); 198 struct proc_dir_entry *pde = PDE(file_inode(file));
204 ssize_t rv = -EIO; 199 ssize_t rv = -EIO;
205 if (use_pde(pde)) { 200 if (use_pde(pde)) {
206 read = pde->proc_fops->read; 201 read = pde->proc_fops->read;
207 if (read) 202 if (read)
208 rv = read(file, buf, count, ppos); 203 rv = read(file, buf, count, ppos);
209 unuse_pde(pde); 204 unuse_pde(pde);
210 } 205 }
211 return rv; 206 return rv;
212 } 207 }
213 208
214 static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) 209 static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
215 { 210 {
216 ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); 211 ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
217 struct proc_dir_entry *pde = PDE(file_inode(file)); 212 struct proc_dir_entry *pde = PDE(file_inode(file));
218 ssize_t rv = -EIO; 213 ssize_t rv = -EIO;
219 if (use_pde(pde)) { 214 if (use_pde(pde)) {
220 write = pde->proc_fops->write; 215 write = pde->proc_fops->write;
221 if (write) 216 if (write)
222 rv = write(file, buf, count, ppos); 217 rv = write(file, buf, count, ppos);
223 unuse_pde(pde); 218 unuse_pde(pde);
224 } 219 }
225 return rv; 220 return rv;
226 } 221 }
227 222
228 static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts) 223 static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts)
229 { 224 {
230 struct proc_dir_entry *pde = PDE(file_inode(file)); 225 struct proc_dir_entry *pde = PDE(file_inode(file));
231 unsigned int rv = DEFAULT_POLLMASK; 226 unsigned int rv = DEFAULT_POLLMASK;
232 unsigned int (*poll)(struct file *, struct poll_table_struct *); 227 unsigned int (*poll)(struct file *, struct poll_table_struct *);
233 if (use_pde(pde)) { 228 if (use_pde(pde)) {
234 poll = pde->proc_fops->poll; 229 poll = pde->proc_fops->poll;
235 if (poll) 230 if (poll)
236 rv = poll(file, pts); 231 rv = poll(file, pts);
237 unuse_pde(pde); 232 unuse_pde(pde);
238 } 233 }
239 return rv; 234 return rv;
240 } 235 }
241 236
242 static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 237 static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
243 { 238 {
244 struct proc_dir_entry *pde = PDE(file_inode(file)); 239 struct proc_dir_entry *pde = PDE(file_inode(file));
245 long rv = -ENOTTY; 240 long rv = -ENOTTY;
246 long (*ioctl)(struct file *, unsigned int, unsigned long); 241 long (*ioctl)(struct file *, unsigned int, unsigned long);
247 if (use_pde(pde)) { 242 if (use_pde(pde)) {
248 ioctl = pde->proc_fops->unlocked_ioctl; 243 ioctl = pde->proc_fops->unlocked_ioctl;
249 if (ioctl) 244 if (ioctl)
250 rv = ioctl(file, cmd, arg); 245 rv = ioctl(file, cmd, arg);
251 unuse_pde(pde); 246 unuse_pde(pde);
252 } 247 }
253 return rv; 248 return rv;
254 } 249 }
255 250
256 #ifdef CONFIG_COMPAT 251 #ifdef CONFIG_COMPAT
257 static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 252 static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
258 { 253 {
259 struct proc_dir_entry *pde = PDE(file_inode(file)); 254 struct proc_dir_entry *pde = PDE(file_inode(file));
260 long rv = -ENOTTY; 255 long rv = -ENOTTY;
261 long (*compat_ioctl)(struct file *, unsigned int, unsigned long); 256 long (*compat_ioctl)(struct file *, unsigned int, unsigned long);
262 if (use_pde(pde)) { 257 if (use_pde(pde)) {
263 compat_ioctl = pde->proc_fops->compat_ioctl; 258 compat_ioctl = pde->proc_fops->compat_ioctl;
264 if (compat_ioctl) 259 if (compat_ioctl)
265 rv = compat_ioctl(file, cmd, arg); 260 rv = compat_ioctl(file, cmd, arg);
266 unuse_pde(pde); 261 unuse_pde(pde);
267 } 262 }
268 return rv; 263 return rv;
269 } 264 }
270 #endif 265 #endif
271 266
272 static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) 267 static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
273 { 268 {
274 struct proc_dir_entry *pde = PDE(file_inode(file)); 269 struct proc_dir_entry *pde = PDE(file_inode(file));
275 int rv = -EIO; 270 int rv = -EIO;
276 int (*mmap)(struct file *, struct vm_area_struct *); 271 int (*mmap)(struct file *, struct vm_area_struct *);
277 if (use_pde(pde)) { 272 if (use_pde(pde)) {
278 mmap = pde->proc_fops->mmap; 273 mmap = pde->proc_fops->mmap;
279 if (mmap) 274 if (mmap)
280 rv = mmap(file, vma); 275 rv = mmap(file, vma);
281 unuse_pde(pde); 276 unuse_pde(pde);
282 } 277 }
283 return rv; 278 return rv;
284 } 279 }
285 280
286 static unsigned long 281 static unsigned long
287 proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, 282 proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
288 unsigned long len, unsigned long pgoff, 283 unsigned long len, unsigned long pgoff,
289 unsigned long flags) 284 unsigned long flags)
290 { 285 {
291 struct proc_dir_entry *pde = PDE(file_inode(file)); 286 struct proc_dir_entry *pde = PDE(file_inode(file));
292 unsigned long rv = -EIO; 287 unsigned long rv = -EIO;
293 288
294 if (use_pde(pde)) { 289 if (use_pde(pde)) {
295 typeof(proc_reg_get_unmapped_area) *get_area; 290 typeof(proc_reg_get_unmapped_area) *get_area;
296 291
297 get_area = pde->proc_fops->get_unmapped_area; 292 get_area = pde->proc_fops->get_unmapped_area;
298 #ifdef CONFIG_MMU 293 #ifdef CONFIG_MMU
299 if (!get_area) 294 if (!get_area)
300 get_area = current->mm->get_unmapped_area; 295 get_area = current->mm->get_unmapped_area;
301 #endif 296 #endif
302 297
303 if (get_area) 298 if (get_area)
304 rv = get_area(file, orig_addr, len, pgoff, flags); 299 rv = get_area(file, orig_addr, len, pgoff, flags);
305 else 300 else
306 rv = orig_addr; 301 rv = orig_addr;
307 unuse_pde(pde); 302 unuse_pde(pde);
308 } 303 }
309 return rv; 304 return rv;
310 } 305 }
311 306
312 static int proc_reg_open(struct inode *inode, struct file *file) 307 static int proc_reg_open(struct inode *inode, struct file *file)
313 { 308 {
314 struct proc_dir_entry *pde = PDE(inode); 309 struct proc_dir_entry *pde = PDE(inode);
315 int rv = 0; 310 int rv = 0;
316 int (*open)(struct inode *, struct file *); 311 int (*open)(struct inode *, struct file *);
317 int (*release)(struct inode *, struct file *); 312 int (*release)(struct inode *, struct file *);
318 struct pde_opener *pdeo; 313 struct pde_opener *pdeo;
319 314
320 /* 315 /*
321 * What for, you ask? Well, we can have open, rmmod, remove_proc_entry 316 * What for, you ask? Well, we can have open, rmmod, remove_proc_entry
322 * sequence. ->release won't be called because ->proc_fops will be 317 * sequence. ->release won't be called because ->proc_fops will be
323 * cleared. Depending on complexity of ->release, consequences vary. 318 * cleared. Depending on complexity of ->release, consequences vary.
324 * 319 *
325 * We can't wait for mercy when close will be done for real, it's 320 * We can't wait for mercy when close will be done for real, it's
326 * deadlockable: rmmod foo </proc/foo . So, we're going to do ->release 321 * deadlockable: rmmod foo </proc/foo . So, we're going to do ->release
327 * by hand in remove_proc_entry(). For this, save the opener's file 322 * by hand in remove_proc_entry(). For this, save the opener's file
328 * for later. 323 * for later.
329 */ 324 */
330 pdeo = kzalloc(sizeof(struct pde_opener), GFP_KERNEL); 325 pdeo = kzalloc(sizeof(struct pde_opener), GFP_KERNEL);
331 if (!pdeo) 326 if (!pdeo)
332 return -ENOMEM; 327 return -ENOMEM;
333 328
334 if (!use_pde(pde)) { 329 if (!use_pde(pde)) {
335 kfree(pdeo); 330 kfree(pdeo);
336 return -ENOENT; 331 return -ENOENT;
337 } 332 }
338 open = pde->proc_fops->open; 333 open = pde->proc_fops->open;
339 release = pde->proc_fops->release; 334 release = pde->proc_fops->release;
340 335
341 if (open) 336 if (open)
342 rv = open(inode, file); 337 rv = open(inode, file);
343 338
344 if (rv == 0 && release) { 339 if (rv == 0 && release) {
345 /* To know what to release. */ 340 /* To know what to release. */
346 pdeo->file = file; 341 pdeo->file = file;
347 /* Strictly for "too late" ->release in proc_reg_release(). */ 342 /* Strictly for "too late" ->release in proc_reg_release(). */
348 spin_lock(&pde->pde_unload_lock); 343 spin_lock(&pde->pde_unload_lock);
349 list_add(&pdeo->lh, &pde->pde_openers); 344 list_add(&pdeo->lh, &pde->pde_openers);
350 spin_unlock(&pde->pde_unload_lock); 345 spin_unlock(&pde->pde_unload_lock);
351 } else 346 } else
352 kfree(pdeo); 347 kfree(pdeo);
353 348
354 unuse_pde(pde); 349 unuse_pde(pde);
355 return rv; 350 return rv;
356 } 351 }
357 352
358 static int proc_reg_release(struct inode *inode, struct file *file) 353 static int proc_reg_release(struct inode *inode, struct file *file)
359 { 354 {
360 struct proc_dir_entry *pde = PDE(inode); 355 struct proc_dir_entry *pde = PDE(inode);
361 struct pde_opener *pdeo; 356 struct pde_opener *pdeo;
362 spin_lock(&pde->pde_unload_lock); 357 spin_lock(&pde->pde_unload_lock);
363 list_for_each_entry(pdeo, &pde->pde_openers, lh) { 358 list_for_each_entry(pdeo, &pde->pde_openers, lh) {
364 if (pdeo->file == file) { 359 if (pdeo->file == file) {
365 close_pdeo(pde, pdeo); 360 close_pdeo(pde, pdeo);
366 break; 361 break;
367 } 362 }
368 } 363 }
369 spin_unlock(&pde->pde_unload_lock); 364 spin_unlock(&pde->pde_unload_lock);
370 return 0; 365 return 0;
371 } 366 }
372 367
373 static const struct file_operations proc_reg_file_ops = { 368 static const struct file_operations proc_reg_file_ops = {
374 .llseek = proc_reg_llseek, 369 .llseek = proc_reg_llseek,
375 .read = proc_reg_read, 370 .read = proc_reg_read,
376 .write = proc_reg_write, 371 .write = proc_reg_write,
377 .poll = proc_reg_poll, 372 .poll = proc_reg_poll,
378 .unlocked_ioctl = proc_reg_unlocked_ioctl, 373 .unlocked_ioctl = proc_reg_unlocked_ioctl,
379 #ifdef CONFIG_COMPAT 374 #ifdef CONFIG_COMPAT
380 .compat_ioctl = proc_reg_compat_ioctl, 375 .compat_ioctl = proc_reg_compat_ioctl,
381 #endif 376 #endif
382 .mmap = proc_reg_mmap, 377 .mmap = proc_reg_mmap,
383 .get_unmapped_area = proc_reg_get_unmapped_area, 378 .get_unmapped_area = proc_reg_get_unmapped_area,
384 .open = proc_reg_open, 379 .open = proc_reg_open,
385 .release = proc_reg_release, 380 .release = proc_reg_release,
386 }; 381 };
387 382
388 #ifdef CONFIG_COMPAT 383 #ifdef CONFIG_COMPAT
389 static const struct file_operations proc_reg_file_ops_no_compat = { 384 static const struct file_operations proc_reg_file_ops_no_compat = {
390 .llseek = proc_reg_llseek, 385 .llseek = proc_reg_llseek,
391 .read = proc_reg_read, 386 .read = proc_reg_read,
392 .write = proc_reg_write, 387 .write = proc_reg_write,
393 .poll = proc_reg_poll, 388 .poll = proc_reg_poll,
394 .unlocked_ioctl = proc_reg_unlocked_ioctl, 389 .unlocked_ioctl = proc_reg_unlocked_ioctl,
395 .mmap = proc_reg_mmap, 390 .mmap = proc_reg_mmap,
396 .get_unmapped_area = proc_reg_get_unmapped_area, 391 .get_unmapped_area = proc_reg_get_unmapped_area,
397 .open = proc_reg_open, 392 .open = proc_reg_open,
398 .release = proc_reg_release, 393 .release = proc_reg_release,
399 }; 394 };
400 #endif 395 #endif
401 396
402 struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) 397 struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
403 { 398 {
404 struct inode *inode = new_inode_pseudo(sb); 399 struct inode *inode = new_inode_pseudo(sb);
405 400
406 if (inode) { 401 if (inode) {
407 inode->i_ino = de->low_ino; 402 inode->i_ino = de->low_ino;
408 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 403 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
409 PROC_I(inode)->pde = de; 404 PROC_I(inode)->pde = de;
410 405
411 if (de->mode) { 406 if (de->mode) {
412 inode->i_mode = de->mode; 407 inode->i_mode = de->mode;
413 inode->i_uid = de->uid; 408 inode->i_uid = de->uid;
414 inode->i_gid = de->gid; 409 inode->i_gid = de->gid;
415 } 410 }
416 if (de->size) 411 if (de->size)
417 inode->i_size = de->size; 412 inode->i_size = de->size;
418 if (de->nlink) 413 if (de->nlink)
419 set_nlink(inode, de->nlink); 414 set_nlink(inode, de->nlink);
420 WARN_ON(!de->proc_iops); 415 WARN_ON(!de->proc_iops);
421 inode->i_op = de->proc_iops; 416 inode->i_op = de->proc_iops;
422 if (de->proc_fops) { 417 if (de->proc_fops) {
423 if (S_ISREG(inode->i_mode)) { 418 if (S_ISREG(inode->i_mode)) {
424 #ifdef CONFIG_COMPAT 419 #ifdef CONFIG_COMPAT
425 if (!de->proc_fops->compat_ioctl) 420 if (!de->proc_fops->compat_ioctl)
426 inode->i_fop = 421 inode->i_fop =
427 &proc_reg_file_ops_no_compat; 422 &proc_reg_file_ops_no_compat;
428 else 423 else
429 #endif 424 #endif
430 inode->i_fop = &proc_reg_file_ops; 425 inode->i_fop = &proc_reg_file_ops;
431 } else { 426 } else {
432 inode->i_fop = de->proc_fops; 427 inode->i_fop = de->proc_fops;
433 } 428 }
434 } 429 }
435 } else 430 } else
436 pde_put(de); 431 pde_put(de);
437 return inode; 432 return inode;
438 } 433 }
439 434
440 int proc_fill_super(struct super_block *s) 435 int proc_fill_super(struct super_block *s)
441 { 436 {
442 struct inode *root_inode; 437 struct inode *root_inode;
443 int ret; 438 int ret;
444 439
445 s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; 440 s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
446 s->s_blocksize = 1024; 441 s->s_blocksize = 1024;
447 s->s_blocksize_bits = 10; 442 s->s_blocksize_bits = 10;
448 s->s_magic = PROC_SUPER_MAGIC; 443 s->s_magic = PROC_SUPER_MAGIC;
449 s->s_op = &proc_sops; 444 s->s_op = &proc_sops;
450 s->s_time_gran = 1; 445 s->s_time_gran = 1;
451 446
452 pde_get(&proc_root); 447 pde_get(&proc_root);
453 root_inode = proc_get_inode(s, &proc_root); 448 root_inode = proc_get_inode(s, &proc_root);
454 if (!root_inode) { 449 if (!root_inode) {
455 pr_err("proc_fill_super: get root inode failed\n"); 450 pr_err("proc_fill_super: get root inode failed\n");
456 return -ENOMEM; 451 return -ENOMEM;
457 } 452 }
458 453
459 s->s_root = d_make_root(root_inode); 454 s->s_root = d_make_root(root_inode);
460 if (!s->s_root) { 455 if (!s->s_root) {
461 pr_err("proc_fill_super: allocate dentry failed\n"); 456 pr_err("proc_fill_super: allocate dentry failed\n");
462 return -ENOMEM; 457 return -ENOMEM;
463 } 458 }
464 459
465 ret = proc_setup_self(s); 460 ret = proc_setup_self(s);
466 if (ret) { 461 if (ret) {
467 return ret; 462 return ret;
468 } 463 }
469 return proc_setup_thread_self(s); 464 return proc_setup_thread_self(s);
470 } 465 }
471 466
fs/proc/namespaces.c
1 #include <linux/proc_fs.h> 1 #include <linux/proc_fs.h>
2 #include <linux/nsproxy.h> 2 #include <linux/nsproxy.h>
3 #include <linux/sched.h>
4 #include <linux/ptrace.h> 3 #include <linux/ptrace.h>
5 #include <linux/fs_struct.h>
6 #include <linux/mount.h>
7 #include <linux/path.h>
8 #include <linux/namei.h> 4 #include <linux/namei.h>
9 #include <linux/file.h> 5 #include <linux/file.h>
10 #include <linux/utsname.h> 6 #include <linux/utsname.h>
11 #include <net/net_namespace.h> 7 #include <net/net_namespace.h>
12 #include <linux/ipc_namespace.h> 8 #include <linux/ipc_namespace.h>
13 #include <linux/pid_namespace.h> 9 #include <linux/pid_namespace.h>
14 #include <linux/user_namespace.h> 10 #include <linux/user_namespace.h>
15 #include "internal.h" 11 #include "internal.h"
16 12
17 13
18 static const struct proc_ns_operations *ns_entries[] = { 14 static const struct proc_ns_operations *ns_entries[] = {
19 #ifdef CONFIG_NET_NS 15 #ifdef CONFIG_NET_NS
20 &netns_operations, 16 &netns_operations,
21 #endif 17 #endif
22 #ifdef CONFIG_UTS_NS 18 #ifdef CONFIG_UTS_NS
23 &utsns_operations, 19 &utsns_operations,
24 #endif 20 #endif
25 #ifdef CONFIG_IPC_NS 21 #ifdef CONFIG_IPC_NS
26 &ipcns_operations, 22 &ipcns_operations,
27 #endif 23 #endif
28 #ifdef CONFIG_PID_NS 24 #ifdef CONFIG_PID_NS
29 &pidns_operations, 25 &pidns_operations,
30 #endif 26 #endif
31 #ifdef CONFIG_USER_NS 27 #ifdef CONFIG_USER_NS
32 &userns_operations, 28 &userns_operations,
33 #endif 29 #endif
34 &mntns_operations, 30 &mntns_operations,
35 }; 31 };
36 32
37 static const struct file_operations ns_file_operations = {
38 .llseek = no_llseek,
39 };
40
41 static const struct inode_operations ns_inode_operations = {
42 .setattr = proc_setattr,
43 };
44
45 static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
46 {
47 struct inode *inode = dentry->d_inode;
48 const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
49
50 return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]",
51 ns_ops->name, inode->i_ino);
52 }
53
54 const struct dentry_operations ns_dentry_operations =
55 {
56 .d_delete = always_delete_dentry,
57 .d_dname = ns_dname,
58 };
59
60 static struct dentry *proc_ns_get_dentry(struct super_block *sb,
61 struct task_struct *task, const struct proc_ns_operations *ns_ops)
62 {
63 struct dentry *dentry, *result;
64 struct inode *inode;
65 struct proc_inode *ei;
66 struct qstr qname = { .name = "", };
67 struct ns_common *ns;
68
69 ns = ns_ops->get(task);
70 if (!ns)
71 return ERR_PTR(-ENOENT);
72
73 dentry = d_alloc_pseudo(sb, &qname);
74 if (!dentry) {
75 ns_ops->put(ns);
76 return ERR_PTR(-ENOMEM);
77 }
78 dentry->d_fsdata = (void *)ns_ops;
79
80 inode = iget_locked(sb, ns->inum);
81 if (!inode) {
82 dput(dentry);
83 ns_ops->put(ns);
84 return ERR_PTR(-ENOMEM);
85 }
86
87 ei = PROC_I(inode);
88 if (inode->i_state & I_NEW) {
89 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
90 inode->i_op = &ns_inode_operations;
91 inode->i_mode = S_IFREG | S_IRUGO;
92 inode->i_fop = &ns_file_operations;
93 ei->ns.ns_ops = ns_ops;
94 ei->ns.ns = ns;
95 unlock_new_inode(inode);
96 } else {
97 ns_ops->put(ns);
98 }
99
100 d_set_d_op(dentry, &ns_dentry_operations);
101 result = d_instantiate_unique(dentry, inode);
102 if (result) {
103 dput(dentry);
104 dentry = result;
105 }
106
107 return dentry;
108 }
109
110 static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) 33 static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
111 { 34 {
112 struct inode *inode = dentry->d_inode; 35 struct inode *inode = dentry->d_inode;
113 struct super_block *sb = inode->i_sb; 36 const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns.ns_ops;
114 struct proc_inode *ei = PROC_I(inode);
115 struct task_struct *task; 37 struct task_struct *task;
116 struct path ns_path; 38 struct path ns_path;
117 void *error = ERR_PTR(-EACCES); 39 void *error = ERR_PTR(-EACCES);
118 40
119 task = get_proc_task(inode); 41 task = get_proc_task(inode);
120 if (!task) 42 if (!task)
121 goto out; 43 return error;
122 44
123 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 45 if (ptrace_may_access(task, PTRACE_MODE_READ)) {
124 goto out_put_task; 46 error = ns_get_path(&ns_path, task, ns_ops);
125 47 if (!error)
126 ns_path.dentry = proc_ns_get_dentry(sb, task, ei->ns.ns_ops); 48 nd_jump_link(nd, &ns_path);
127 if (IS_ERR(ns_path.dentry)) {
128 error = ERR_CAST(ns_path.dentry);
129 goto out_put_task;
130 } 49 }
131
132 ns_path.mnt = mntget(nd->path.mnt);
133 nd_jump_link(nd, &ns_path);
134 error = NULL;
135
136 out_put_task:
137 put_task_struct(task); 50 put_task_struct(task);
138 out:
139 return error; 51 return error;
140 } 52 }
141 53
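The rewritten proc_ns_follow_link() above no longer fabricates a procfs dentry and borrows nd->path.mnt; it asks ns_get_path() for a complete <vfsmount, dentry> pair and jumps there. Inferred from this call site alone (the implementation lives in fs/nsfs.c, outside the hunks on this page), the contract is roughly:

    /* Inferred, not quoted from fs/nsfs.c: fills *path with counted
     * mnt + dentry references on success (consumed by nd_jump_link())
     * and returns NULL then, or an ERR_PTR() on failure. */
    void *ns_get_path(struct path *path, struct task_struct *task,
                      const struct proc_ns_operations *ns_ops);
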
142 static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) 54 static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen)
143 { 55 {
144 struct inode *inode = dentry->d_inode; 56 struct inode *inode = dentry->d_inode;
145 struct proc_inode *ei = PROC_I(inode); 57 const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns.ns_ops;
146 const struct proc_ns_operations *ns_ops = ei->ns.ns_ops;
147 struct task_struct *task; 58 struct task_struct *task;
148 struct ns_common *ns;
149 char name[50]; 59 char name[50];
150 int res = -EACCES; 60 int res = -EACCES;
151 61
152 task = get_proc_task(inode); 62 task = get_proc_task(inode);
153 if (!task) 63 if (!task)
154 goto out; 64 return res;
155 65
156 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 66 if (ptrace_may_access(task, PTRACE_MODE_READ)) {
157 goto out_put_task; 67 res = ns_get_name(name, sizeof(name), task, ns_ops);
158 68 if (res >= 0)
159 res = -ENOENT; 69 res = readlink_copy(buffer, buflen, name);
160 ns = ns_ops->get(task); 70 }
161 if (!ns)
162 goto out_put_task;
163
164 snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns->inum);
165 res = readlink_copy(buffer, buflen, name);
166 ns_ops->put(ns);
167 out_put_task:
168 put_task_struct(task); 71 put_task_struct(task);
169 out:
170 return res; 72 return res;
171 } 73 }
172 74
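The readlink path now delegates name generation to ns_get_name() instead of the open-coded ns_ops->get()/snprintf() pair it replaces; the format visible to userspace stays "name:[inum]". An illustrative userspace check (the inode number in the comment is made up):

    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            char buf[64];
            ssize_t n = readlink("/proc/self/ns/uts", buf, sizeof(buf) - 1);

            if (n < 0)
                    return 1;
            buf[n] = '\0';
            printf("%s\n", buf);    /* e.g. "uts:[4026531838]" */
            return 0;
    }
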
173 static const struct inode_operations proc_ns_link_inode_operations = { 75 static const struct inode_operations proc_ns_link_inode_operations = {
174 .readlink = proc_ns_readlink, 76 .readlink = proc_ns_readlink,
175 .follow_link = proc_ns_follow_link, 77 .follow_link = proc_ns_follow_link,
176 .setattr = proc_setattr, 78 .setattr = proc_setattr,
177 }; 79 };
178 80
179 static int proc_ns_instantiate(struct inode *dir, 81 static int proc_ns_instantiate(struct inode *dir,
180 struct dentry *dentry, struct task_struct *task, const void *ptr) 82 struct dentry *dentry, struct task_struct *task, const void *ptr)
181 { 83 {
182 const struct proc_ns_operations *ns_ops = ptr; 84 const struct proc_ns_operations *ns_ops = ptr;
183 struct inode *inode; 85 struct inode *inode;
184 struct proc_inode *ei; 86 struct proc_inode *ei;
185 87
186 inode = proc_pid_make_inode(dir->i_sb, task); 88 inode = proc_pid_make_inode(dir->i_sb, task);
187 if (!inode) 89 if (!inode)
188 goto out; 90 goto out;
189 91
190 ei = PROC_I(inode); 92 ei = PROC_I(inode);
191 inode->i_mode = S_IFLNK|S_IRWXUGO; 93 inode->i_mode = S_IFLNK|S_IRWXUGO;
192 inode->i_op = &proc_ns_link_inode_operations; 94 inode->i_op = &proc_ns_link_inode_operations;
193 ei->ns.ns_ops = ns_ops; 95 ei->ns.ns_ops = ns_ops;
194 96
195 d_set_d_op(dentry, &pid_dentry_operations); 97 d_set_d_op(dentry, &pid_dentry_operations);
196 d_add(dentry, inode); 98 d_add(dentry, inode);
197 /* Close the race of the process dying before we return the dentry */ 99 /* Close the race of the process dying before we return the dentry */
198 if (pid_revalidate(dentry, 0)) 100 if (pid_revalidate(dentry, 0))
199 return 0; 101 return 0;
200 out: 102 out:
201 return -ENOENT; 103 return -ENOENT;
202 } 104 }
203 105
204 static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx) 106 static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
205 { 107 {
206 struct task_struct *task = get_proc_task(file_inode(file)); 108 struct task_struct *task = get_proc_task(file_inode(file));
207 const struct proc_ns_operations **entry, **last; 109 const struct proc_ns_operations **entry, **last;
208 110
209 if (!task) 111 if (!task)
210 return -ENOENT; 112 return -ENOENT;
211 113
212 if (!dir_emit_dots(file, ctx)) 114 if (!dir_emit_dots(file, ctx))
213 goto out; 115 goto out;
214 if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries)) 116 if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries))
215 goto out; 117 goto out;
216 entry = ns_entries + (ctx->pos - 2); 118 entry = ns_entries + (ctx->pos - 2);
217 last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; 119 last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
218 while (entry <= last) { 120 while (entry <= last) {
219 const struct proc_ns_operations *ops = *entry; 121 const struct proc_ns_operations *ops = *entry;
220 if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name), 122 if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name),
221 proc_ns_instantiate, task, ops)) 123 proc_ns_instantiate, task, ops))
222 break; 124 break;
223 ctx->pos++; 125 ctx->pos++;
224 entry++; 126 entry++;
225 } 127 }
226 out: 128 out:
227 put_task_struct(task); 129 put_task_struct(task);
228 return 0; 130 return 0;
229 } 131 }
230 132
231 const struct file_operations proc_ns_dir_operations = { 133 const struct file_operations proc_ns_dir_operations = {
232 .read = generic_read_dir, 134 .read = generic_read_dir,
233 .iterate = proc_ns_dir_readdir, 135 .iterate = proc_ns_dir_readdir,
234 }; 136 };
235 137
236 static struct dentry *proc_ns_dir_lookup(struct inode *dir, 138 static struct dentry *proc_ns_dir_lookup(struct inode *dir,
237 struct dentry *dentry, unsigned int flags) 139 struct dentry *dentry, unsigned int flags)
238 { 140 {
239 int error; 141 int error;
240 struct task_struct *task = get_proc_task(dir); 142 struct task_struct *task = get_proc_task(dir);
241 const struct proc_ns_operations **entry, **last; 143 const struct proc_ns_operations **entry, **last;
242 unsigned int len = dentry->d_name.len; 144 unsigned int len = dentry->d_name.len;
243 145
244 error = -ENOENT; 146 error = -ENOENT;
245 147
246 if (!task) 148 if (!task)
247 goto out_no_task; 149 goto out_no_task;
248 150
249 last = &ns_entries[ARRAY_SIZE(ns_entries)]; 151 last = &ns_entries[ARRAY_SIZE(ns_entries)];
250 for (entry = ns_entries; entry < last; entry++) { 152 for (entry = ns_entries; entry < last; entry++) {
251 if (strlen((*entry)->name) != len) 153 if (strlen((*entry)->name) != len)
252 continue; 154 continue;
253 if (!memcmp(dentry->d_name.name, (*entry)->name, len)) 155 if (!memcmp(dentry->d_name.name, (*entry)->name, len))
254 break; 156 break;
255 } 157 }
256 if (entry == last) 158 if (entry == last)
257 goto out; 159 goto out;
258 160
259 error = proc_ns_instantiate(dir, dentry, task, *entry); 161 error = proc_ns_instantiate(dir, dentry, task, *entry);
260 out: 162 out:
261 put_task_struct(task); 163 put_task_struct(task);
262 out_no_task: 164 out_no_task:
263 return ERR_PTR(error); 165 return ERR_PTR(error);
264 } 166 }
265 167
266 const struct inode_operations proc_ns_dir_inode_operations = { 168 const struct inode_operations proc_ns_dir_inode_operations = {
267 .lookup = proc_ns_dir_lookup, 169 .lookup = proc_ns_dir_lookup,
268 .getattr = pid_getattr, 170 .getattr = pid_getattr,
269 .setattr = proc_setattr, 171 .setattr = proc_setattr,
270 }; 172 };
271
272 struct file *proc_ns_fget(int fd)
273 {
274 struct file *file;
275
276 file = fget(fd);
277 if (!file)
278 return ERR_PTR(-EBADF);
279
280 if (file->f_op != &ns_file_operations)
281 goto out_invalid;
282
283 return file;
284
285 out_invalid:
286 fput(file);
287 return ERR_PTR(-EINVAL);
288 }
289
290 struct ns_common *get_proc_ns(struct inode *inode)
291 {
292 return PROC_I(inode)->ns.ns;
293 }
294
295 bool proc_ns_inode(struct inode *inode)
296 {
297 return inode->i_fop == &ns_file_operations;
298 }
299 173
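With proc_ns_fget(), get_proc_ns() and proc_ns_inode() removed from this file, nothing changes at the userspace boundary: a namespace file is still opened from /proc/<pid>/ns/ and handed to setns(2). Minimal usage sketch (pid 1234 is a placeholder; joining a net namespace needs CAP_SYS_ADMIN):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <sched.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/proc/1234/ns/net", O_RDONLY);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (setns(fd, CLONE_NEWNET) < 0) {  /* join that namespace */
                    perror("setns");
                    return 1;
            }
            close(fd);
            return 0;
    }
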
include/linux/ns_common.h
1 #ifndef _LINUX_NS_COMMON_H 1 #ifndef _LINUX_NS_COMMON_H
2 #define _LINUX_NS_COMMON_H 2 #define _LINUX_NS_COMMON_H
3 3
4 struct proc_ns_operations; 4 struct proc_ns_operations;
5 5
6 struct ns_common { 6 struct ns_common {
7 atomic_long_t stashed;
7 const struct proc_ns_operations *ops; 8 const struct proc_ns_operations *ops;
8 unsigned int inum; 9 unsigned int inum;
9 }; 10 };
10 11
11 #endif 12 #endif
12 13
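The new stashed word is an atomic_long_t, zeroed by ns_alloc_inum() (see the proc_ns.h hunk below). Judging by the type alone, it caches a dentry pointer so that repeated traversals of the same namespace link can reuse one dentry instead of allocating a fresh one each time. A hypothetical sketch of that reuse check; the names and details are assumptions, not the fs/nsfs.c code:

    /* Hypothetical: stashed is either 0 or the dentry currently
     * representing this namespace on the new filesystem. */
    struct dentry *d = (struct dentry *)atomic_long_read(&ns->stashed);
    if (d && lockref_get_not_dead(&d->d_lockref))
            return d;       /* cheap reuse; else build a new dentry */
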
include/linux/proc_ns.h
1 /* 1 /*
2 * procfs namespace bits 2 * procfs namespace bits
3 */ 3 */
4 #ifndef _LINUX_PROC_NS_H 4 #ifndef _LINUX_PROC_NS_H
5 #define _LINUX_PROC_NS_H 5 #define _LINUX_PROC_NS_H
6 6
7 #include <linux/ns_common.h>
8
7 struct pid_namespace; 9 struct pid_namespace;
8 struct nsproxy; 10 struct nsproxy;
9 struct ns_common; 11 struct path;
10 12
11 struct proc_ns_operations { 13 struct proc_ns_operations {
12 const char *name; 14 const char *name;
13 int type; 15 int type;
14 struct ns_common *(*get)(struct task_struct *task); 16 struct ns_common *(*get)(struct task_struct *task);
15 void (*put)(struct ns_common *ns); 17 void (*put)(struct ns_common *ns);
16 int (*install)(struct nsproxy *nsproxy, struct ns_common *ns); 18 int (*install)(struct nsproxy *nsproxy, struct ns_common *ns);
17 }; 19 };
18 20
19 extern const struct proc_ns_operations netns_operations; 21 extern const struct proc_ns_operations netns_operations;
20 extern const struct proc_ns_operations utsns_operations; 22 extern const struct proc_ns_operations utsns_operations;
21 extern const struct proc_ns_operations ipcns_operations; 23 extern const struct proc_ns_operations ipcns_operations;
22 extern const struct proc_ns_operations pidns_operations; 24 extern const struct proc_ns_operations pidns_operations;
23 extern const struct proc_ns_operations userns_operations; 25 extern const struct proc_ns_operations userns_operations;
24 extern const struct proc_ns_operations mntns_operations; 26 extern const struct proc_ns_operations mntns_operations;
25 27
26 /* 28 /*
27 * We always define these enumerators 29 * We always define these enumerators
28 */ 30 */
29 enum { 31 enum {
30 PROC_ROOT_INO = 1, 32 PROC_ROOT_INO = 1,
31 PROC_IPC_INIT_INO = 0xEFFFFFFFU, 33 PROC_IPC_INIT_INO = 0xEFFFFFFFU,
32 PROC_UTS_INIT_INO = 0xEFFFFFFEU, 34 PROC_UTS_INIT_INO = 0xEFFFFFFEU,
33 PROC_USER_INIT_INO = 0xEFFFFFFDU, 35 PROC_USER_INIT_INO = 0xEFFFFFFDU,
34 PROC_PID_INIT_INO = 0xEFFFFFFCU, 36 PROC_PID_INIT_INO = 0xEFFFFFFCU,
35 }; 37 };
36 38
37 #ifdef CONFIG_PROC_FS 39 #ifdef CONFIG_PROC_FS
38 40
39 extern int pid_ns_prepare_proc(struct pid_namespace *ns); 41 extern int pid_ns_prepare_proc(struct pid_namespace *ns);
40 extern void pid_ns_release_proc(struct pid_namespace *ns); 42 extern void pid_ns_release_proc(struct pid_namespace *ns);
41 extern struct file *proc_ns_fget(int fd);
42 extern struct ns_common *get_proc_ns(struct inode *);
43 extern int proc_alloc_inum(unsigned int *pino); 43 extern int proc_alloc_inum(unsigned int *pino);
44 extern void proc_free_inum(unsigned int inum); 44 extern void proc_free_inum(unsigned int inum);
45 extern bool proc_ns_inode(struct inode *inode);
46 45
47 #else /* CONFIG_PROC_FS */ 46 #else /* CONFIG_PROC_FS */
48 47
49 static inline int pid_ns_prepare_proc(struct pid_namespace *ns) { return 0; } 48 static inline int pid_ns_prepare_proc(struct pid_namespace *ns) { return 0; }
50 static inline void pid_ns_release_proc(struct pid_namespace *ns) {} 49 static inline void pid_ns_release_proc(struct pid_namespace *ns) {}
51 50
52 static inline struct file *proc_ns_fget(int fd)
53 {
54 return ERR_PTR(-EINVAL);
55 }
56
57 static inline struct ns_common *get_proc_ns(struct inode *inode) { return NULL; }
58
59 static inline int proc_alloc_inum(unsigned int *inum) 51 static inline int proc_alloc_inum(unsigned int *inum)
60 { 52 {
61 *inum = 1; 53 *inum = 1;
62 return 0; 54 return 0;
63 } 55 }
64 static inline void proc_free_inum(unsigned int inum) {} 56 static inline void proc_free_inum(unsigned int inum) {}
65 static inline bool proc_ns_inode(struct inode *inode) { return false; }
66 57
67 #endif /* CONFIG_PROC_FS */ 58 #endif /* CONFIG_PROC_FS */
68 59
69 #define ns_alloc_inum(ns) proc_alloc_inum(&(ns)->inum) 60 static inline int ns_alloc_inum(struct ns_common *ns)
61 {
62 atomic_long_set(&ns->stashed, 0);
63 return proc_alloc_inum(&ns->inum);
64 }
65
70 #define ns_free_inum(ns) proc_free_inum((ns)->inum) 66 #define ns_free_inum(ns) proc_free_inum((ns)->inum)
67
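ns_alloc_inum() is promoted from a macro to a static inline so it can also clear the new stashed word before the inode number is handed out. A sketch of how a namespace constructor would use the pair (the helper name is hypothetical, not a real kernel function):

    static int init_ns_common(struct ns_common *ns,
                              const struct proc_ns_operations *ops)
    {
            int err = ns_alloc_inum(ns);    /* also zeroes ns->stashed */

            if (err)
                    return err;
            ns->ops = ops;
            return 0;
    }
    /* teardown side: ns_free_inum(ns); */
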
include/uapi/linux/magic.h
1 #ifndef __LINUX_MAGIC_H__ 1 #ifndef __LINUX_MAGIC_H__
2 #define __LINUX_MAGIC_H__ 2 #define __LINUX_MAGIC_H__
3 3
4 #define ADFS_SUPER_MAGIC 0xadf5 4 #define ADFS_SUPER_MAGIC 0xadf5
5 #define AFFS_SUPER_MAGIC 0xadff 5 #define AFFS_SUPER_MAGIC 0xadff
6 #define AFS_SUPER_MAGIC 0x5346414F 6 #define AFS_SUPER_MAGIC 0x5346414F
7 #define AUTOFS_SUPER_MAGIC 0x0187 7 #define AUTOFS_SUPER_MAGIC 0x0187
8 #define CODA_SUPER_MAGIC 0x73757245 8 #define CODA_SUPER_MAGIC 0x73757245
9 #define CRAMFS_MAGIC 0x28cd3d45 /* some random number */ 9 #define CRAMFS_MAGIC 0x28cd3d45 /* some random number */
10 #define CRAMFS_MAGIC_WEND 0x453dcd28 /* magic number with the wrong endianness */ 10 #define CRAMFS_MAGIC_WEND 0x453dcd28 /* magic number with the wrong endianness */
11 #define DEBUGFS_MAGIC 0x64626720 11 #define DEBUGFS_MAGIC 0x64626720
12 #define SECURITYFS_MAGIC 0x73636673 12 #define SECURITYFS_MAGIC 0x73636673
13 #define SELINUX_MAGIC 0xf97cff8c 13 #define SELINUX_MAGIC 0xf97cff8c
14 #define SMACK_MAGIC 0x43415d53 /* "SMAC" */ 14 #define SMACK_MAGIC 0x43415d53 /* "SMAC" */
15 #define RAMFS_MAGIC 0x858458f6 /* some random number */ 15 #define RAMFS_MAGIC 0x858458f6 /* some random number */
16 #define TMPFS_MAGIC 0x01021994 16 #define TMPFS_MAGIC 0x01021994
17 #define HUGETLBFS_MAGIC 0x958458f6 /* some random number */ 17 #define HUGETLBFS_MAGIC 0x958458f6 /* some random number */
18 #define SQUASHFS_MAGIC 0x73717368 18 #define SQUASHFS_MAGIC 0x73717368
19 #define ECRYPTFS_SUPER_MAGIC 0xf15f 19 #define ECRYPTFS_SUPER_MAGIC 0xf15f
20 #define EFS_SUPER_MAGIC 0x414A53 20 #define EFS_SUPER_MAGIC 0x414A53
21 #define EXT2_SUPER_MAGIC 0xEF53 21 #define EXT2_SUPER_MAGIC 0xEF53
22 #define EXT3_SUPER_MAGIC 0xEF53 22 #define EXT3_SUPER_MAGIC 0xEF53
23 #define XENFS_SUPER_MAGIC 0xabba1974 23 #define XENFS_SUPER_MAGIC 0xabba1974
24 #define EXT4_SUPER_MAGIC 0xEF53 24 #define EXT4_SUPER_MAGIC 0xEF53
25 #define BTRFS_SUPER_MAGIC 0x9123683E 25 #define BTRFS_SUPER_MAGIC 0x9123683E
26 #define NILFS_SUPER_MAGIC 0x3434 26 #define NILFS_SUPER_MAGIC 0x3434
27 #define F2FS_SUPER_MAGIC 0xF2F52010 27 #define F2FS_SUPER_MAGIC 0xF2F52010
28 #define HPFS_SUPER_MAGIC 0xf995e849 28 #define HPFS_SUPER_MAGIC 0xf995e849
29 #define ISOFS_SUPER_MAGIC 0x9660 29 #define ISOFS_SUPER_MAGIC 0x9660
30 #define JFFS2_SUPER_MAGIC 0x72b6 30 #define JFFS2_SUPER_MAGIC 0x72b6
31 #define PSTOREFS_MAGIC 0x6165676C 31 #define PSTOREFS_MAGIC 0x6165676C
32 #define EFIVARFS_MAGIC 0xde5e81e4 32 #define EFIVARFS_MAGIC 0xde5e81e4
33 #define HOSTFS_SUPER_MAGIC 0x00c0ffee 33 #define HOSTFS_SUPER_MAGIC 0x00c0ffee
34 34
35 #define MINIX_SUPER_MAGIC 0x137F /* minix v1 fs, 14 char names */ 35 #define MINIX_SUPER_MAGIC 0x137F /* minix v1 fs, 14 char names */
36 #define MINIX_SUPER_MAGIC2 0x138F /* minix v1 fs, 30 char names */ 36 #define MINIX_SUPER_MAGIC2 0x138F /* minix v1 fs, 30 char names */
37 #define MINIX2_SUPER_MAGIC 0x2468 /* minix v2 fs, 14 char names */ 37 #define MINIX2_SUPER_MAGIC 0x2468 /* minix v2 fs, 14 char names */
38 #define MINIX2_SUPER_MAGIC2 0x2478 /* minix v2 fs, 30 char names */ 38 #define MINIX2_SUPER_MAGIC2 0x2478 /* minix v2 fs, 30 char names */
39 #define MINIX3_SUPER_MAGIC 0x4d5a /* minix v3 fs, 60 char names */ 39 #define MINIX3_SUPER_MAGIC 0x4d5a /* minix v3 fs, 60 char names */
40 40
41 #define MSDOS_SUPER_MAGIC 0x4d44 /* MD */ 41 #define MSDOS_SUPER_MAGIC 0x4d44 /* MD */
42 #define NCP_SUPER_MAGIC 0x564c /* Guess, what 0x564c is :-) */ 42 #define NCP_SUPER_MAGIC 0x564c /* Guess, what 0x564c is :-) */
43 #define NFS_SUPER_MAGIC 0x6969 43 #define NFS_SUPER_MAGIC 0x6969
44 #define OPENPROM_SUPER_MAGIC 0x9fa1 44 #define OPENPROM_SUPER_MAGIC 0x9fa1
45 #define QNX4_SUPER_MAGIC 0x002f /* qnx4 fs detection */ 45 #define QNX4_SUPER_MAGIC 0x002f /* qnx4 fs detection */
46 #define QNX6_SUPER_MAGIC 0x68191122 /* qnx6 fs detection */ 46 #define QNX6_SUPER_MAGIC 0x68191122 /* qnx6 fs detection */
47 47
48 #define REISERFS_SUPER_MAGIC 0x52654973 /* used by gcc */ 48 #define REISERFS_SUPER_MAGIC 0x52654973 /* used by gcc */
49 /* used by file system utilities that 49 /* used by file system utilities that
50 look at the superblock, etc. */ 50 look at the superblock, etc. */
51 #define REISERFS_SUPER_MAGIC_STRING "ReIsErFs" 51 #define REISERFS_SUPER_MAGIC_STRING "ReIsErFs"
52 #define REISER2FS_SUPER_MAGIC_STRING "ReIsEr2Fs" 52 #define REISER2FS_SUPER_MAGIC_STRING "ReIsEr2Fs"
53 #define REISER2FS_JR_SUPER_MAGIC_STRING "ReIsEr3Fs" 53 #define REISER2FS_JR_SUPER_MAGIC_STRING "ReIsEr3Fs"
54 54
55 #define SMB_SUPER_MAGIC 0x517B 55 #define SMB_SUPER_MAGIC 0x517B
56 #define CGROUP_SUPER_MAGIC 0x27e0eb 56 #define CGROUP_SUPER_MAGIC 0x27e0eb
57 57
58 58
59 #define STACK_END_MAGIC 0x57AC6E9D 59 #define STACK_END_MAGIC 0x57AC6E9D
60 60
61 #define V9FS_MAGIC 0x01021997 61 #define V9FS_MAGIC 0x01021997
62 62
63 #define BDEVFS_MAGIC 0x62646576 63 #define BDEVFS_MAGIC 0x62646576
64 #define BINFMTFS_MAGIC 0x42494e4d 64 #define BINFMTFS_MAGIC 0x42494e4d
65 #define DEVPTS_SUPER_MAGIC 0x1cd1 65 #define DEVPTS_SUPER_MAGIC 0x1cd1
66 #define FUTEXFS_SUPER_MAGIC 0xBAD1DEA 66 #define FUTEXFS_SUPER_MAGIC 0xBAD1DEA
67 #define PIPEFS_MAGIC 0x50495045 67 #define PIPEFS_MAGIC 0x50495045
68 #define PROC_SUPER_MAGIC 0x9fa0 68 #define PROC_SUPER_MAGIC 0x9fa0
69 #define SOCKFS_MAGIC 0x534F434B 69 #define SOCKFS_MAGIC 0x534F434B
70 #define SYSFS_MAGIC 0x62656572 70 #define SYSFS_MAGIC 0x62656572
71 #define USBDEVICE_SUPER_MAGIC 0x9fa2 71 #define USBDEVICE_SUPER_MAGIC 0x9fa2
72 #define MTD_INODE_FS_MAGIC 0x11307854 72 #define MTD_INODE_FS_MAGIC 0x11307854
73 #define ANON_INODE_FS_MAGIC 0x09041934 73 #define ANON_INODE_FS_MAGIC 0x09041934
74 #define BTRFS_TEST_MAGIC 0x73727279 74 #define BTRFS_TEST_MAGIC 0x73727279
75 #define NSFS_MAGIC 0x6e736673
75 76
76 #endif /* __LINUX_MAGIC_H__ */ 77 #endif /* __LINUX_MAGIC_H__ */
77 78
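NSFS_MAGIC is 0x6e736673, the ASCII bytes of "nsfs". On kernels carrying this change, userspace can detect that a descriptor refers to a namespace file with fstatfs() rather than by string-matching /proc paths:

    #include <sys/vfs.h>
    #include <linux/magic.h>        /* NSFS_MAGIC */

    /* 1 if fd is a namespace file, 0 if not, -1 on error */
    static int is_ns_fd(int fd)
    {
            struct statfs s;

            if (fstatfs(fd, &s) < 0)
                    return -1;
            return s.f_type == NSFS_MAGIC;
    }
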
init/main.c
1 /* 1 /*
2 * linux/init/main.c 2 * linux/init/main.c
3 * 3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * 5 *
6 * GK 2/5/95 - Changed to support mounting root fs via NFS 6 * GK 2/5/95 - Changed to support mounting root fs via NFS
7 * Added initrd & change_root: Werner Almesberger & Hans Lermen, Feb '96 7 * Added initrd & change_root: Werner Almesberger & Hans Lermen, Feb '96
8 * Moan early if gcc is old, avoiding bogus kernels - Paul Gortmaker, May '96 8 * Moan early if gcc is old, avoiding bogus kernels - Paul Gortmaker, May '96
9 * Simplified starting of init: Michael A. Griffith <grif@acm.org> 9 * Simplified starting of init: Michael A. Griffith <grif@acm.org>
10 */ 10 */
11 11
12 #define DEBUG /* Enable initcall_debug */ 12 #define DEBUG /* Enable initcall_debug */
13 13
14 #include <linux/types.h> 14 #include <linux/types.h>
15 #include <linux/module.h> 15 #include <linux/module.h>
16 #include <linux/proc_fs.h> 16 #include <linux/proc_fs.h>
17 #include <linux/kernel.h> 17 #include <linux/kernel.h>
18 #include <linux/syscalls.h> 18 #include <linux/syscalls.h>
19 #include <linux/stackprotector.h> 19 #include <linux/stackprotector.h>
20 #include <linux/string.h> 20 #include <linux/string.h>
21 #include <linux/ctype.h> 21 #include <linux/ctype.h>
22 #include <linux/delay.h> 22 #include <linux/delay.h>
23 #include <linux/ioport.h> 23 #include <linux/ioport.h>
24 #include <linux/init.h> 24 #include <linux/init.h>
25 #include <linux/initrd.h> 25 #include <linux/initrd.h>
26 #include <linux/bootmem.h> 26 #include <linux/bootmem.h>
27 #include <linux/acpi.h> 27 #include <linux/acpi.h>
28 #include <linux/tty.h> 28 #include <linux/tty.h>
29 #include <linux/percpu.h> 29 #include <linux/percpu.h>
30 #include <linux/kmod.h> 30 #include <linux/kmod.h>
31 #include <linux/vmalloc.h> 31 #include <linux/vmalloc.h>
32 #include <linux/kernel_stat.h> 32 #include <linux/kernel_stat.h>
33 #include <linux/start_kernel.h> 33 #include <linux/start_kernel.h>
34 #include <linux/security.h> 34 #include <linux/security.h>
35 #include <linux/smp.h> 35 #include <linux/smp.h>
36 #include <linux/profile.h> 36 #include <linux/profile.h>
37 #include <linux/rcupdate.h> 37 #include <linux/rcupdate.h>
38 #include <linux/moduleparam.h> 38 #include <linux/moduleparam.h>
39 #include <linux/kallsyms.h> 39 #include <linux/kallsyms.h>
40 #include <linux/writeback.h> 40 #include <linux/writeback.h>
41 #include <linux/cpu.h> 41 #include <linux/cpu.h>
42 #include <linux/cpuset.h> 42 #include <linux/cpuset.h>
43 #include <linux/cgroup.h> 43 #include <linux/cgroup.h>
44 #include <linux/efi.h> 44 #include <linux/efi.h>
45 #include <linux/tick.h> 45 #include <linux/tick.h>
46 #include <linux/interrupt.h> 46 #include <linux/interrupt.h>
47 #include <linux/taskstats_kern.h> 47 #include <linux/taskstats_kern.h>
48 #include <linux/delayacct.h> 48 #include <linux/delayacct.h>
49 #include <linux/unistd.h> 49 #include <linux/unistd.h>
50 #include <linux/rmap.h> 50 #include <linux/rmap.h>
51 #include <linux/mempolicy.h> 51 #include <linux/mempolicy.h>
52 #include <linux/key.h> 52 #include <linux/key.h>
53 #include <linux/buffer_head.h> 53 #include <linux/buffer_head.h>
54 #include <linux/page_cgroup.h> 54 #include <linux/page_cgroup.h>
55 #include <linux/debug_locks.h> 55 #include <linux/debug_locks.h>
56 #include <linux/debugobjects.h> 56 #include <linux/debugobjects.h>
57 #include <linux/lockdep.h> 57 #include <linux/lockdep.h>
58 #include <linux/kmemleak.h> 58 #include <linux/kmemleak.h>
59 #include <linux/pid_namespace.h> 59 #include <linux/pid_namespace.h>
60 #include <linux/device.h> 60 #include <linux/device.h>
61 #include <linux/kthread.h> 61 #include <linux/kthread.h>
62 #include <linux/sched.h> 62 #include <linux/sched.h>
63 #include <linux/signal.h> 63 #include <linux/signal.h>
64 #include <linux/idr.h> 64 #include <linux/idr.h>
65 #include <linux/kgdb.h> 65 #include <linux/kgdb.h>
66 #include <linux/ftrace.h> 66 #include <linux/ftrace.h>
67 #include <linux/async.h> 67 #include <linux/async.h>
68 #include <linux/kmemcheck.h> 68 #include <linux/kmemcheck.h>
69 #include <linux/sfi.h> 69 #include <linux/sfi.h>
70 #include <linux/shmem_fs.h> 70 #include <linux/shmem_fs.h>
71 #include <linux/slab.h> 71 #include <linux/slab.h>
72 #include <linux/perf_event.h> 72 #include <linux/perf_event.h>
73 #include <linux/file.h> 73 #include <linux/file.h>
74 #include <linux/ptrace.h> 74 #include <linux/ptrace.h>
75 #include <linux/blkdev.h> 75 #include <linux/blkdev.h>
76 #include <linux/elevator.h> 76 #include <linux/elevator.h>
77 #include <linux/sched_clock.h> 77 #include <linux/sched_clock.h>
78 #include <linux/context_tracking.h> 78 #include <linux/context_tracking.h>
79 #include <linux/random.h> 79 #include <linux/random.h>
80 #include <linux/list.h> 80 #include <linux/list.h>
81 #include <linux/proc_ns.h>
81 82
82 #include <asm/io.h> 83 #include <asm/io.h>
83 #include <asm/bugs.h> 84 #include <asm/bugs.h>
84 #include <asm/setup.h> 85 #include <asm/setup.h>
85 #include <asm/sections.h> 86 #include <asm/sections.h>
86 #include <asm/cacheflush.h> 87 #include <asm/cacheflush.h>
87 88
88 #ifdef CONFIG_X86_LOCAL_APIC 89 #ifdef CONFIG_X86_LOCAL_APIC
89 #include <asm/smp.h> 90 #include <asm/smp.h>
90 #endif 91 #endif
91 92
92 static int kernel_init(void *); 93 static int kernel_init(void *);
93 94
94 extern void init_IRQ(void); 95 extern void init_IRQ(void);
95 extern void fork_init(unsigned long); 96 extern void fork_init(unsigned long);
96 extern void radix_tree_init(void); 97 extern void radix_tree_init(void);
97 #ifndef CONFIG_DEBUG_RODATA 98 #ifndef CONFIG_DEBUG_RODATA
98 static inline void mark_rodata_ro(void) { } 99 static inline void mark_rodata_ro(void) { }
99 #endif 100 #endif
100 101
101 /* 102 /*
102 * Debug helper: via this flag we know that we are in 'early bootup code' 103 * Debug helper: via this flag we know that we are in 'early bootup code'
103 * where only the boot processor is running with IRQ disabled. This means 104 * where only the boot processor is running with IRQ disabled. This means
104 * two things - IRQ must not be enabled before the flag is cleared and some 105 * two things - IRQ must not be enabled before the flag is cleared and some
105 * operations which are not allowed with IRQ disabled are allowed while the 106 * operations which are not allowed with IRQ disabled are allowed while the
106 * flag is set. 107 * flag is set.
107 */ 108 */
108 bool early_boot_irqs_disabled __read_mostly; 109 bool early_boot_irqs_disabled __read_mostly;
109 110
110 enum system_states system_state __read_mostly; 111 enum system_states system_state __read_mostly;
111 EXPORT_SYMBOL(system_state); 112 EXPORT_SYMBOL(system_state);
112 113
113 /* 114 /*
114 * Boot command-line arguments 115 * Boot command-line arguments
115 */ 116 */
116 #define MAX_INIT_ARGS CONFIG_INIT_ENV_ARG_LIMIT 117 #define MAX_INIT_ARGS CONFIG_INIT_ENV_ARG_LIMIT
117 #define MAX_INIT_ENVS CONFIG_INIT_ENV_ARG_LIMIT 118 #define MAX_INIT_ENVS CONFIG_INIT_ENV_ARG_LIMIT
118 119
119 extern void time_init(void); 120 extern void time_init(void);
120 /* Default late time init is NULL. archs can override this later. */ 121 /* Default late time init is NULL. archs can override this later. */
121 void (*__initdata late_time_init)(void); 122 void (*__initdata late_time_init)(void);
122 123
123 /* Untouched command line saved by arch-specific code. */ 124 /* Untouched command line saved by arch-specific code. */
124 char __initdata boot_command_line[COMMAND_LINE_SIZE]; 125 char __initdata boot_command_line[COMMAND_LINE_SIZE];
125 /* Untouched saved command line (eg. for /proc) */ 126 /* Untouched saved command line (eg. for /proc) */
126 char *saved_command_line; 127 char *saved_command_line;
127 /* Command line for parameter parsing */ 128 /* Command line for parameter parsing */
128 static char *static_command_line; 129 static char *static_command_line;
129 /* Command line for per-initcall parameter parsing */ 130 /* Command line for per-initcall parameter parsing */
130 static char *initcall_command_line; 131 static char *initcall_command_line;
131 132
132 static char *execute_command; 133 static char *execute_command;
133 static char *ramdisk_execute_command; 134 static char *ramdisk_execute_command;
134 135
135 /* 136 /*
136 * Used to generate warnings if static_key manipulation functions are used 137 * Used to generate warnings if static_key manipulation functions are used
137 * before jump_label_init is called. 138 * before jump_label_init is called.
138 */ 139 */
139 bool static_key_initialized __read_mostly; 140 bool static_key_initialized __read_mostly;
140 EXPORT_SYMBOL_GPL(static_key_initialized); 141 EXPORT_SYMBOL_GPL(static_key_initialized);
141 142
142 /* 143 /*
143 * If set, this is an indication to the drivers to reset the underlying 144 * If set, this is an indication to the drivers to reset the underlying
144 * device before going ahead with the initialization; otherwise the driver might 145 * device before going ahead with the initialization; otherwise the driver might
145 * rely on the BIOS and skip the reset operation. 146 * rely on the BIOS and skip the reset operation.
146 * 147 *
147 * This is useful if kernel is booting in an unreliable environment. 148 * This is useful if kernel is booting in an unreliable environment.
148 * For ex. kdump situation where previous kernel has crashed, BIOS has been 149 * For ex. kdump situation where previous kernel has crashed, BIOS has been
149 * skipped and devices will be in unknown state. 150 * skipped and devices will be in unknown state.
150 */ 151 */
151 unsigned int reset_devices; 152 unsigned int reset_devices;
152 EXPORT_SYMBOL(reset_devices); 153 EXPORT_SYMBOL(reset_devices);
153 154
154 static int __init set_reset_devices(char *str) 155 static int __init set_reset_devices(char *str)
155 { 156 {
156 reset_devices = 1; 157 reset_devices = 1;
157 return 1; 158 return 1;
158 } 159 }
159 160
160 __setup("reset_devices", set_reset_devices); 161 __setup("reset_devices", set_reset_devices);
161 162
162 static const char *argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; 163 static const char *argv_init[MAX_INIT_ARGS+2] = { "init", NULL, };
163 const char *envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; 164 const char *envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, };
164 static const char *panic_later, *panic_param; 165 static const char *panic_later, *panic_param;
165 166
166 extern const struct obs_kernel_param __setup_start[], __setup_end[]; 167 extern const struct obs_kernel_param __setup_start[], __setup_end[];
167 168
168 static int __init obsolete_checksetup(char *line) 169 static int __init obsolete_checksetup(char *line)
169 { 170 {
170 const struct obs_kernel_param *p; 171 const struct obs_kernel_param *p;
171 int had_early_param = 0; 172 int had_early_param = 0;
172 173
173 p = __setup_start; 174 p = __setup_start;
174 do { 175 do {
175 int n = strlen(p->str); 176 int n = strlen(p->str);
176 if (parameqn(line, p->str, n)) { 177 if (parameqn(line, p->str, n)) {
177 if (p->early) { 178 if (p->early) {
178 /* Already done in parse_early_param? 179 /* Already done in parse_early_param?
179 * (Needs exact match on param part). 180 * (Needs exact match on param part).
180 * Keep iterating, as we can have early 181 * Keep iterating, as we can have early
181 * params and __setups of same names 8( */ 182 * params and __setups of same names 8( */
182 if (line[n] == '\0' || line[n] == '=') 183 if (line[n] == '\0' || line[n] == '=')
183 had_early_param = 1; 184 had_early_param = 1;
184 } else if (!p->setup_func) { 185 } else if (!p->setup_func) {
185 pr_warn("Parameter %s is obsolete, ignored\n", 186 pr_warn("Parameter %s is obsolete, ignored\n",
186 p->str); 187 p->str);
187 return 1; 188 return 1;
188 } else if (p->setup_func(line + n)) 189 } else if (p->setup_func(line + n))
189 return 1; 190 return 1;
190 } 191 }
191 p++; 192 p++;
192 } while (p < __setup_end); 193 } while (p < __setup_end);
193 194
194 return had_early_param; 195 return had_early_param;
195 } 196 }
196 197
197 /* 198 /*
198 * This should be approx 2 Bo*oMips to start (note initial shift), and will 199 * This should be approx 2 Bo*oMips to start (note initial shift), and will
199 * still work even if initially too large, it will just take slightly longer 200 * still work even if initially too large, it will just take slightly longer
200 */ 201 */
201 unsigned long loops_per_jiffy = (1<<12); 202 unsigned long loops_per_jiffy = (1<<12);
202 EXPORT_SYMBOL(loops_per_jiffy); 203 EXPORT_SYMBOL(loops_per_jiffy);
203 204
204 static int __init debug_kernel(char *str) 205 static int __init debug_kernel(char *str)
205 { 206 {
206 console_loglevel = CONSOLE_LOGLEVEL_DEBUG; 207 console_loglevel = CONSOLE_LOGLEVEL_DEBUG;
207 return 0; 208 return 0;
208 } 209 }
209 210
210 static int __init quiet_kernel(char *str) 211 static int __init quiet_kernel(char *str)
211 { 212 {
212 console_loglevel = CONSOLE_LOGLEVEL_QUIET; 213 console_loglevel = CONSOLE_LOGLEVEL_QUIET;
213 return 0; 214 return 0;
214 } 215 }
215 216
216 early_param("debug", debug_kernel); 217 early_param("debug", debug_kernel);
217 early_param("quiet", quiet_kernel); 218 early_param("quiet", quiet_kernel);
218 219
219 static int __init loglevel(char *str) 220 static int __init loglevel(char *str)
220 { 221 {
221 int newlevel; 222 int newlevel;
222 223
223 /* 224 /*
224 * Only update loglevel value when a correct setting was passed, 225 * Only update loglevel value when a correct setting was passed,
225 * to prevent blind crashes (when loglevel is set to 0) that 226 * to prevent blind crashes (when loglevel is set to 0) that
226 * are quite hard to debug 227 * are quite hard to debug
227 */ 228 */
228 if (get_option(&str, &newlevel)) { 229 if (get_option(&str, &newlevel)) {
229 console_loglevel = newlevel; 230 console_loglevel = newlevel;
230 return 0; 231 return 0;
231 } 232 }
232 233
233 return -EINVAL; 234 return -EINVAL;
234 } 235 }
235 236
236 early_param("loglevel", loglevel); 237 early_param("loglevel", loglevel);
237 238
238 /* Change NUL term back to "=", to make "param" the whole string. */ 239 /* Change NUL term back to "=", to make "param" the whole string. */
239 static int __init repair_env_string(char *param, char *val, const char *unused) 240 static int __init repair_env_string(char *param, char *val, const char *unused)
240 { 241 {
241 if (val) { 242 if (val) {
242 /* param=val or param="val"? */ 243 /* param=val or param="val"? */
243 if (val == param+strlen(param)+1) 244 if (val == param+strlen(param)+1)
244 val[-1] = '='; 245 val[-1] = '=';
245 else if (val == param+strlen(param)+2) { 246 else if (val == param+strlen(param)+2) {
246 val[-2] = '='; 247 val[-2] = '=';
247 memmove(val-1, val, strlen(val)+1); 248 memmove(val-1, val, strlen(val)+1);
248 val--; 249 val--;
249 } else 250 } else
250 BUG(); 251 BUG();
251 } 252 }
252 return 0; 253 return 0;
253 } 254 }
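A worked example of the repair above, since the in-place trick is easy to misread: parse_args() NUL-terminates the parameter name at the '=', and repair_env_string() writes the '=' back so the environment slot keeps one contiguous string.

    /* Illustrative: after parsing, "TERM=linux" has become
     *     param -> "TERM\0linux",  val -> "linux"
     * and writing '=' over the NUL restores "TERM=linux".
     * For a quoted value ("TERM=\"linux\"") val starts two bytes
     * past the name, hence the memmove() branch above. */
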
254 255
255 /* Anything after -- gets handed straight to init. */ 256 /* Anything after -- gets handed straight to init. */
256 static int __init set_init_arg(char *param, char *val, const char *unused) 257 static int __init set_init_arg(char *param, char *val, const char *unused)
257 { 258 {
258 unsigned int i; 259 unsigned int i;
259 260
260 if (panic_later) 261 if (panic_later)
261 return 0; 262 return 0;
262 263
263 repair_env_string(param, val, unused); 264 repair_env_string(param, val, unused);
264 265
265 for (i = 0; argv_init[i]; i++) { 266 for (i = 0; argv_init[i]; i++) {
266 if (i == MAX_INIT_ARGS) { 267 if (i == MAX_INIT_ARGS) {
267 panic_later = "init"; 268 panic_later = "init";
268 panic_param = param; 269 panic_param = param;
269 return 0; 270 return 0;
270 } 271 }
271 } 272 }
272 argv_init[i] = param; 273 argv_init[i] = param;
273 return 0; 274 return 0;
274 } 275 }
275 276
276 /* 277 /*
277 * Unknown boot options get handed to init, unless they look like 278 * Unknown boot options get handed to init, unless they look like
278 * unused parameters (modprobe will find them in /proc/cmdline). 279 * unused parameters (modprobe will find them in /proc/cmdline).
279 */ 280 */
280 static int __init unknown_bootoption(char *param, char *val, const char *unused) 281 static int __init unknown_bootoption(char *param, char *val, const char *unused)
281 { 282 {
282 repair_env_string(param, val, unused); 283 repair_env_string(param, val, unused);
283 284
284 /* Handle obsolete-style parameters */ 285 /* Handle obsolete-style parameters */
285 if (obsolete_checksetup(param)) 286 if (obsolete_checksetup(param))
286 return 0; 287 return 0;
287 288
288 /* Unused module parameter. */ 289 /* Unused module parameter. */
289 if (strchr(param, '.') && (!val || strchr(param, '.') < val)) 290 if (strchr(param, '.') && (!val || strchr(param, '.') < val))
290 return 0; 291 return 0;
291 292
292 if (panic_later) 293 if (panic_later)
293 return 0; 294 return 0;
294 295
295 if (val) { 296 if (val) {
296 /* Environment option */ 297 /* Environment option */
297 unsigned int i; 298 unsigned int i;
298 for (i = 0; envp_init[i]; i++) { 299 for (i = 0; envp_init[i]; i++) {
299 if (i == MAX_INIT_ENVS) { 300 if (i == MAX_INIT_ENVS) {
300 panic_later = "env"; 301 panic_later = "env";
301 panic_param = param; 302 panic_param = param;
302 } 303 }
303 if (!strncmp(param, envp_init[i], val - param)) 304 if (!strncmp(param, envp_init[i], val - param))
304 break; 305 break;
305 } 306 }
306 envp_init[i] = param; 307 envp_init[i] = param;
307 } else { 308 } else {
308 /* Command line option */ 309 /* Command line option */
309 unsigned int i; 310 unsigned int i;
310 for (i = 0; argv_init[i]; i++) { 311 for (i = 0; argv_init[i]; i++) {
311 if (i == MAX_INIT_ARGS) { 312 if (i == MAX_INIT_ARGS) {
312 panic_later = "init"; 313 panic_later = "init";
313 panic_param = param; 314 panic_param = param;
314 } 315 }
315 } 316 }
316 argv_init[i] = param; 317 argv_init[i] = param;
317 } 318 }
318 return 0; 319 return 0;
319 } 320 }
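To make the dispatch above concrete, here is how a few unknown options would be routed (the option strings are made up for illustration):

    /* "snd_hda_intel.power_save=1" -> '.' before '=': left for modprobe
     * "TERM=vt100"                 -> has a value:    envp_init[]
     * "single"                     -> no value:       argv_init[]   */
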
320 321
321 static int __init init_setup(char *str) 322 static int __init init_setup(char *str)
322 { 323 {
323 unsigned int i; 324 unsigned int i;
324 325
325 execute_command = str; 326 execute_command = str;
326 /* 327 /*
327 * In case LILO is going to boot us with default command line, 328 * In case LILO is going to boot us with default command line,
328 * it prepends "auto" before the whole cmdline which makes 329 * it prepends "auto" before the whole cmdline which makes
329 * the shell think it should execute a script with such name. 330 * the shell think it should execute a script with such name.
330 * So we ignore all arguments entered _before_ init=... [MJ] 331 * So we ignore all arguments entered _before_ init=... [MJ]
331 */ 332 */
332 for (i = 1; i < MAX_INIT_ARGS; i++) 333 for (i = 1; i < MAX_INIT_ARGS; i++)
333 argv_init[i] = NULL; 334 argv_init[i] = NULL;
334 return 1; 335 return 1;
335 } 336 }
336 __setup("init=", init_setup); 337 __setup("init=", init_setup);
337 338
338 static int __init rdinit_setup(char *str) 339 static int __init rdinit_setup(char *str)
339 { 340 {
340 unsigned int i; 341 unsigned int i;
341 342
342 ramdisk_execute_command = str; 343 ramdisk_execute_command = str;
343 /* See "auto" comment in init_setup */ 344 /* See "auto" comment in init_setup */
344 for (i = 1; i < MAX_INIT_ARGS; i++) 345 for (i = 1; i < MAX_INIT_ARGS; i++)
345 argv_init[i] = NULL; 346 argv_init[i] = NULL;
346 return 1; 347 return 1;
347 } 348 }
348 __setup("rdinit=", rdinit_setup); 349 __setup("rdinit=", rdinit_setup);
349 350
350 #ifndef CONFIG_SMP 351 #ifndef CONFIG_SMP
351 static const unsigned int setup_max_cpus = NR_CPUS; 352 static const unsigned int setup_max_cpus = NR_CPUS;
352 #ifdef CONFIG_X86_LOCAL_APIC 353 #ifdef CONFIG_X86_LOCAL_APIC
353 static void __init smp_init(void) 354 static void __init smp_init(void)
354 { 355 {
355 APIC_init_uniprocessor(); 356 APIC_init_uniprocessor();
356 } 357 }
357 #else 358 #else
358 #define smp_init() do { } while (0) 359 #define smp_init() do { } while (0)
359 #endif 360 #endif
360 361
361 static inline void setup_nr_cpu_ids(void) { } 362 static inline void setup_nr_cpu_ids(void) { }
362 static inline void smp_prepare_cpus(unsigned int maxcpus) { } 363 static inline void smp_prepare_cpus(unsigned int maxcpus) { }
363 #endif 364 #endif
364 365
365 /* 366 /*
366 * We need to store the untouched command line for future reference. 367 * We need to store the untouched command line for future reference.
367 * We also need to store the touched command line since the parameter 368 * We also need to store the touched command line since the parameter
368 * parsing is performed in place, and we should allow a component to 369 * parsing is performed in place, and we should allow a component to
369 * store reference of name/value for future reference. 370 * store reference of name/value for future reference.
370 */ 371 */
371 static void __init setup_command_line(char *command_line) 372 static void __init setup_command_line(char *command_line)
372 { 373 {
373 saved_command_line = 374 saved_command_line =
374 memblock_virt_alloc(strlen(boot_command_line) + 1, 0); 375 memblock_virt_alloc(strlen(boot_command_line) + 1, 0);
375 initcall_command_line = 376 initcall_command_line =
376 memblock_virt_alloc(strlen(boot_command_line) + 1, 0); 377 memblock_virt_alloc(strlen(boot_command_line) + 1, 0);
377 static_command_line = memblock_virt_alloc(strlen(command_line) + 1, 0); 378 static_command_line = memblock_virt_alloc(strlen(command_line) + 1, 0);
378 strcpy(saved_command_line, boot_command_line); 379 strcpy(saved_command_line, boot_command_line);
379 strcpy(static_command_line, command_line); 380 strcpy(static_command_line, command_line);
380 } 381 }
381 382
382 /* 383 /*
383 * We need to finalize in a non-__init function or else race conditions 384 * We need to finalize in a non-__init function or else race conditions
384 * between the root thread and the init thread may cause start_kernel to 385 * between the root thread and the init thread may cause start_kernel to
385 * be reaped by free_initmem before the root thread has proceeded to 386 * be reaped by free_initmem before the root thread has proceeded to
386 * cpu_idle. 387 * cpu_idle.
387 * 388 *
388 * gcc-3.4 accidentally inlines this function, so use noinline. 389 * gcc-3.4 accidentally inlines this function, so use noinline.
389 */ 390 */
390 391
391 static __initdata DECLARE_COMPLETION(kthreadd_done); 392 static __initdata DECLARE_COMPLETION(kthreadd_done);
392 393
393 static noinline void __init_refok rest_init(void) 394 static noinline void __init_refok rest_init(void)
394 { 395 {
395 int pid; 396 int pid;
396 397
397 rcu_scheduler_starting(); 398 rcu_scheduler_starting();
398 /* 399 /*
399 * We need to spawn init first so that it obtains pid 1, however 400 * We need to spawn init first so that it obtains pid 1, however
400 * the init task will end up wanting to create kthreads, which, if 401 * the init task will end up wanting to create kthreads, which, if
401 * we schedule it before we create kthreadd, will OOPS. 402 * we schedule it before we create kthreadd, will OOPS.
402 */ 403 */
403 kernel_thread(kernel_init, NULL, CLONE_FS); 404 kernel_thread(kernel_init, NULL, CLONE_FS);
404 numa_default_policy(); 405 numa_default_policy();
405 pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); 406 pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
406 rcu_read_lock(); 407 rcu_read_lock();
407 kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns); 408 kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
408 rcu_read_unlock(); 409 rcu_read_unlock();
409 complete(&kthreadd_done); 410 complete(&kthreadd_done);
410 411
411 /* 412 /*
412 * The boot idle thread must execute schedule() 413 * The boot idle thread must execute schedule()
413 * at least once to get things moving: 414 * at least once to get things moving:
414 */ 415 */
415 init_idle_bootup_task(current); 416 init_idle_bootup_task(current);
416 schedule_preempt_disabled(); 417 schedule_preempt_disabled();
417 /* Call into cpu_idle with preempt disabled */ 418 /* Call into cpu_idle with preempt disabled */
418 cpu_startup_entry(CPUHP_ONLINE); 419 cpu_startup_entry(CPUHP_ONLINE);
419 } 420 }
420 421
421 /* Check for early params. */ 422 /* Check for early params. */
422 static int __init do_early_param(char *param, char *val, const char *unused) 423 static int __init do_early_param(char *param, char *val, const char *unused)
423 { 424 {
424 const struct obs_kernel_param *p; 425 const struct obs_kernel_param *p;
425 426
426 for (p = __setup_start; p < __setup_end; p++) { 427 for (p = __setup_start; p < __setup_end; p++) {
427 if ((p->early && parameq(param, p->str)) || 428 if ((p->early && parameq(param, p->str)) ||
428 (strcmp(param, "console") == 0 && 429 (strcmp(param, "console") == 0 &&
429 strcmp(p->str, "earlycon") == 0) 430 strcmp(p->str, "earlycon") == 0)
430 ) { 431 ) {
431 if (p->setup_func(val) != 0) 432 if (p->setup_func(val) != 0)
432 pr_warn("Malformed early option '%s'\n", param); 433 pr_warn("Malformed early option '%s'\n", param);
433 } 434 }
434 } 435 }
435 /* We accept everything at this stage. */ 436 /* We accept everything at this stage. */
436 return 0; 437 return 0;
437 } 438 }
438 439
439 void __init parse_early_options(char *cmdline) 440 void __init parse_early_options(char *cmdline)
440 { 441 {
441 parse_args("early options", cmdline, NULL, 0, 0, 0, do_early_param); 442 parse_args("early options", cmdline, NULL, 0, 0, 0, do_early_param);
442 } 443 }
443 444
444 /* Arch code calls this early on, or if not, just before other parsing. */ 445 /* Arch code calls this early on, or if not, just before other parsing. */
445 void __init parse_early_param(void) 446 void __init parse_early_param(void)
446 { 447 {
447 static int done __initdata; 448 static int done __initdata;
448 static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata; 449 static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata;
449 450
450 if (done) 451 if (done)
451 return; 452 return;
452 453
453 /* All fall through to do_early_param. */ 454 /* All fall through to do_early_param. */
454 strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); 455 strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE);
455 parse_early_options(tmp_cmdline); 456 parse_early_options(tmp_cmdline);
456 done = 1; 457 done = 1;
457 } 458 }
458 459
459 /* 460 /*
460 * Activate the first processor. 461 * Activate the first processor.
461 */ 462 */
462 463
463 static void __init boot_cpu_init(void) 464 static void __init boot_cpu_init(void)
464 { 465 {
465 int cpu = smp_processor_id(); 466 int cpu = smp_processor_id();
466 /* Mark the boot cpu "present", "online" etc for SMP and UP case */ 467 /* Mark the boot cpu "present", "online" etc for SMP and UP case */
467 set_cpu_online(cpu, true); 468 set_cpu_online(cpu, true);
468 set_cpu_active(cpu, true); 469 set_cpu_active(cpu, true);
469 set_cpu_present(cpu, true); 470 set_cpu_present(cpu, true);
470 set_cpu_possible(cpu, true); 471 set_cpu_possible(cpu, true);
471 } 472 }
472 473
473 void __init __weak smp_setup_processor_id(void) 474 void __init __weak smp_setup_processor_id(void)
474 { 475 {
475 } 476 }
476 477
477 # if THREAD_SIZE >= PAGE_SIZE 478 # if THREAD_SIZE >= PAGE_SIZE
478 void __init __weak thread_info_cache_init(void) 479 void __init __weak thread_info_cache_init(void)
479 { 480 {
480 } 481 }
481 #endif 482 #endif
482 483
483 /* 484 /*
484 * Set up kernel memory allocators 485 * Set up kernel memory allocators
485 */ 486 */
486 static void __init mm_init(void) 487 static void __init mm_init(void)
487 { 488 {
488 /* 489 /*
489 * page_cgroup requires contiguous pages, 490 * page_cgroup requires contiguous pages,
490 * bigger than MAX_ORDER unless SPARSEMEM. 491 * bigger than MAX_ORDER unless SPARSEMEM.
491 */ 492 */
492 page_cgroup_init_flatmem(); 493 page_cgroup_init_flatmem();
493 mem_init(); 494 mem_init();
494 kmem_cache_init(); 495 kmem_cache_init();
495 percpu_init_late(); 496 percpu_init_late();
496 pgtable_init(); 497 pgtable_init();
497 vmalloc_init(); 498 vmalloc_init();
498 } 499 }
499 500
500 asmlinkage __visible void __init start_kernel(void) 501 asmlinkage __visible void __init start_kernel(void)
501 { 502 {
502 char *command_line; 503 char *command_line;
503 char *after_dashes; 504 char *after_dashes;
504 505
505 /* 506 /*
506 * Need to run as early as possible, to initialize the 507 * Need to run as early as possible, to initialize the
507 * lockdep hash: 508 * lockdep hash:
508 */ 509 */
509 lockdep_init(); 510 lockdep_init();
510 set_task_stack_end_magic(&init_task); 511 set_task_stack_end_magic(&init_task);
511 smp_setup_processor_id(); 512 smp_setup_processor_id();
512 debug_objects_early_init(); 513 debug_objects_early_init();
513 514
514 /* 515 /*
515 * Set up the initial canary ASAP: 516 * Set up the initial canary ASAP:
516 */ 517 */
517 boot_init_stack_canary(); 518 boot_init_stack_canary();
518 519
519 cgroup_init_early(); 520 cgroup_init_early();
520 521
521 local_irq_disable(); 522 local_irq_disable();
522 early_boot_irqs_disabled = true; 523 early_boot_irqs_disabled = true;
523 524
524 /* 525 /*
525 * Interrupts are still disabled. Do necessary setups, then 526 * Interrupts are still disabled. Do necessary setups, then
526 * enable them 527 * enable them
527 */ 528 */
528 boot_cpu_init(); 529 boot_cpu_init();
529 page_address_init(); 530 page_address_init();
530 pr_notice("%s", linux_banner); 531 pr_notice("%s", linux_banner);
531 setup_arch(&command_line); 532 setup_arch(&command_line);
532 mm_init_cpumask(&init_mm); 533 mm_init_cpumask(&init_mm);
533 setup_command_line(command_line); 534 setup_command_line(command_line);
534 setup_nr_cpu_ids(); 535 setup_nr_cpu_ids();
535 setup_per_cpu_areas(); 536 setup_per_cpu_areas();
536 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ 537 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
537 538
538 build_all_zonelists(NULL, NULL); 539 build_all_zonelists(NULL, NULL);
539 page_alloc_init(); 540 page_alloc_init();
540 541
541 pr_notice("Kernel command line: %s\n", boot_command_line); 542 pr_notice("Kernel command line: %s\n", boot_command_line);
542 parse_early_param(); 543 parse_early_param();
543 after_dashes = parse_args("Booting kernel", 544 after_dashes = parse_args("Booting kernel",
544 static_command_line, __start___param, 545 static_command_line, __start___param,
545 __stop___param - __start___param, 546 __stop___param - __start___param,
546 -1, -1, &unknown_bootoption); 547 -1, -1, &unknown_bootoption);
547 if (after_dashes) 548 if (after_dashes)
548 parse_args("Setting init args", after_dashes, NULL, 0, -1, -1, 549 parse_args("Setting init args", after_dashes, NULL, 0, -1, -1,
549 set_init_arg); 550 set_init_arg);
550 551
551 jump_label_init(); 552 jump_label_init();
552 553
553 /* 554 /*
554 * These use large bootmem allocations and must precede 555 * These use large bootmem allocations and must precede
555 * kmem_cache_init() 556 * kmem_cache_init()
556 */ 557 */
557 setup_log_buf(0); 558 setup_log_buf(0);
558 pidhash_init(); 559 pidhash_init();
559 vfs_caches_init_early(); 560 vfs_caches_init_early();
560 sort_main_extable(); 561 sort_main_extable();
561 trap_init(); 562 trap_init();
562 mm_init(); 563 mm_init();
563 564
564 /* 565 /*
565 * Set up the scheduler prior to starting any interrupts (such as the 566 * Set up the scheduler prior to starting any interrupts (such as the
566 * timer interrupt). Full topology setup happens at smp_init() 567 * timer interrupt). Full topology setup happens at smp_init()
567 * time - but meanwhile we still have a functioning scheduler. 568 * time - but meanwhile we still have a functioning scheduler.
568 */ 569 */
569 sched_init(); 570 sched_init();
570 /* 571 /*
571 * Disable preemption - early bootup scheduling is extremely 572 * Disable preemption - early bootup scheduling is extremely
572 * fragile until we cpu_idle() for the first time. 573 * fragile until we cpu_idle() for the first time.
573 */ 574 */
574 preempt_disable(); 575 preempt_disable();
575 if (WARN(!irqs_disabled(), 576 if (WARN(!irqs_disabled(),
576 "Interrupts were enabled *very* early, fixing it\n")) 577 "Interrupts were enabled *very* early, fixing it\n"))
577 local_irq_disable(); 578 local_irq_disable();
578 idr_init_cache(); 579 idr_init_cache();
579 rcu_init(); 580 rcu_init();
580 context_tracking_init(); 581 context_tracking_init();
581 radix_tree_init(); 582 radix_tree_init();
582 /* init some links before init_ISA_irqs() */ 583 /* init some links before init_ISA_irqs() */
583 early_irq_init(); 584 early_irq_init();
584 init_IRQ(); 585 init_IRQ();
585 tick_init(); 586 tick_init();
586 rcu_init_nohz(); 587 rcu_init_nohz();
587 init_timers(); 588 init_timers();
588 hrtimers_init(); 589 hrtimers_init();
589 softirq_init(); 590 softirq_init();
590 timekeeping_init(); 591 timekeeping_init();
591 time_init(); 592 time_init();
592 sched_clock_postinit(); 593 sched_clock_postinit();
593 perf_event_init(); 594 perf_event_init();
594 profile_init(); 595 profile_init();
595 call_function_init(); 596 call_function_init();
596 WARN(!irqs_disabled(), "Interrupts were enabled early\n"); 597 WARN(!irqs_disabled(), "Interrupts were enabled early\n");
597 early_boot_irqs_disabled = false; 598 early_boot_irqs_disabled = false;
598 local_irq_enable(); 599 local_irq_enable();
599 600
600 kmem_cache_init_late(); 601 kmem_cache_init_late();
601 602
602 /* 603 /*
603 * HACK ALERT! This is early. We're enabling the console before 604 * HACK ALERT! This is early. We're enabling the console before
604 * we've done PCI setups etc, and console_init() must be aware of 605 * we've done PCI setups etc, and console_init() must be aware of
605 * this. But we do want output early, in case something goes wrong. 606 * this. But we do want output early, in case something goes wrong.
606 */ 607 */
607 console_init(); 608 console_init();
608 if (panic_later) 609 if (panic_later)
609 panic("Too many boot %s vars at `%s'", panic_later, 610 panic("Too many boot %s vars at `%s'", panic_later,
610 panic_param); 611 panic_param);
611 612
612 lockdep_info(); 613 lockdep_info();
613 614
614 /* 615 /*
615 * Need to run this when irqs are enabled, because it wants 616 * Need to run this when irqs are enabled, because it wants
616 * to self-test [hard/soft]-irqs on/off lock inversion bugs 617 * to self-test [hard/soft]-irqs on/off lock inversion bugs
617 * too: 618 * too:
618 */ 619 */
619 locking_selftest(); 620 locking_selftest();
620 621
621 #ifdef CONFIG_BLK_DEV_INITRD 622 #ifdef CONFIG_BLK_DEV_INITRD
622 if (initrd_start && !initrd_below_start_ok && 623 if (initrd_start && !initrd_below_start_ok &&
623 page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) { 624 page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) {
624 pr_crit("initrd overwritten (0x%08lx < 0x%08lx) - disabling it.\n", 625 pr_crit("initrd overwritten (0x%08lx < 0x%08lx) - disabling it.\n",
625 page_to_pfn(virt_to_page((void *)initrd_start)), 626 page_to_pfn(virt_to_page((void *)initrd_start)),
626 min_low_pfn); 627 min_low_pfn);
627 initrd_start = 0; 628 initrd_start = 0;
628 } 629 }
629 #endif 630 #endif
630 page_cgroup_init(); 631 page_cgroup_init();
631 debug_objects_mem_init(); 632 debug_objects_mem_init();
632 kmemleak_init(); 633 kmemleak_init();
633 setup_per_cpu_pageset(); 634 setup_per_cpu_pageset();
634 numa_policy_init(); 635 numa_policy_init();
635 if (late_time_init) 636 if (late_time_init)
636 late_time_init(); 637 late_time_init();
637 sched_clock_init(); 638 sched_clock_init();
638 calibrate_delay(); 639 calibrate_delay();
639 pidmap_init(); 640 pidmap_init();
640 anon_vma_init(); 641 anon_vma_init();
641 acpi_early_init(); 642 acpi_early_init();
642 #ifdef CONFIG_X86 643 #ifdef CONFIG_X86
643 if (efi_enabled(EFI_RUNTIME_SERVICES)) 644 if (efi_enabled(EFI_RUNTIME_SERVICES))
644 efi_enter_virtual_mode(); 645 efi_enter_virtual_mode();
645 #endif 646 #endif
646 #ifdef CONFIG_X86_ESPFIX64 647 #ifdef CONFIG_X86_ESPFIX64
647 /* Should be run before the first non-init thread is created */ 648 /* Should be run before the first non-init thread is created */
648 init_espfix_bsp(); 649 init_espfix_bsp();
649 #endif 650 #endif
650 thread_info_cache_init(); 651 thread_info_cache_init();
651 cred_init(); 652 cred_init();
652 fork_init(totalram_pages); 653 fork_init(totalram_pages);
653 proc_caches_init(); 654 proc_caches_init();
654 buffer_init(); 655 buffer_init();
655 key_init(); 656 key_init();
656 security_init(); 657 security_init();
657 dbg_late_init(); 658 dbg_late_init();
658 vfs_caches_init(totalram_pages); 659 vfs_caches_init(totalram_pages);
659 signals_init(); 660 signals_init();
660 /* rootfs populating might need page-writeback */ 661 /* rootfs populating might need page-writeback */
661 page_writeback_init(); 662 page_writeback_init();
662 proc_root_init(); 663 proc_root_init();
664 nsfs_init();
663 cgroup_init(); 665 cgroup_init();
664 cpuset_init(); 666 cpuset_init();
665 taskstats_init_early(); 667 taskstats_init_early();
666 delayacct_init(); 668 delayacct_init();
667 669
668 check_bugs(); 670 check_bugs();
669 671
670 sfi_init_late(); 672 sfi_init_late();
671 673
672 if (efi_enabled(EFI_RUNTIME_SERVICES)) { 674 if (efi_enabled(EFI_RUNTIME_SERVICES)) {
673 efi_late_init(); 675 efi_late_init();
674 efi_free_boot_services(); 676 efi_free_boot_services();
675 } 677 }
676 678
677 ftrace_init(); 679 ftrace_init();
678 680
679 /* Do the rest non-__init'ed, we're now alive */ 681 /* Do the rest non-__init'ed, we're now alive */
680 rest_init(); 682 rest_init();
681 } 683 }
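
The one functional change to start_kernel() in this commit is the nsfs_init() call slotted in after proc_root_init() above: it pins the internal mount of the new pseudo-filesystem so the targets of /proc/*/ns/* symlinks have somewhere to live before any task looks at them. As a rough sketch of what fs/nsfs.c has to do at this point, assuming the stock mount_pseudo()/kern_mount() helpers for kernel-internal filesystems (nsfs_ops and ns_dentry_operations come from the rest of the patch, and the exact code may differ):

    static struct vfsmount *nsfs_mnt;

    /* Never register_filesystem()'d, so the type does not appear in
     * /proc/filesystems and cannot be mounted from userspace. */
    static struct dentry *nsfs_mount(struct file_system_type *fs_type,
                                     int flags, const char *dev_name, void *data)
    {
            return mount_pseudo(fs_type, "nsfs:", &nsfs_ops,
                                &ns_dentry_operations, NSFS_MAGIC);
    }

    static struct file_system_type nsfs = {
            .name    = "nsfs",
            .mount   = nsfs_mount,
            .kill_sb = kill_anon_super,
    };

    void __init nsfs_init(void)
    {
            /* Pin one internal instance for the life of the system. */
            nsfs_mnt = kern_mount(&nsfs);
            if (IS_ERR(nsfs_mnt))
                    panic("can't set nsfs up\n");
    }

With that mount pinned, ns_get_path() can hand nd_jump_link() a consistent <nsfs_mnt, dentry> pair without ever poking at the procfs mount.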
682 684
683 /* Call all constructor functions linked into the kernel. */ 685 /* Call all constructor functions linked into the kernel. */
684 static void __init do_ctors(void) 686 static void __init do_ctors(void)
685 { 687 {
686 #ifdef CONFIG_CONSTRUCTORS 688 #ifdef CONFIG_CONSTRUCTORS
687 ctor_fn_t *fn = (ctor_fn_t *) __ctors_start; 689 ctor_fn_t *fn = (ctor_fn_t *) __ctors_start;
688 690
689 for (; fn < (ctor_fn_t *) __ctors_end; fn++) 691 for (; fn < (ctor_fn_t *) __ctors_end; fn++)
690 (*fn)(); 692 (*fn)();
691 #endif 693 #endif
692 } 694 }
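
do_ctors() just walks the linker-collected __ctors_start..__ctors_end table. Entries get there through the compiler's constructor machinery; in the kernel they are normally emitted by instrumentation such as gcov rather than written by hand, but as a hypothetical illustration (foo_ctor is not in the tree):

    #ifdef CONFIG_CONSTRUCTORS
    /* gcc collects constructor-attributed functions into the section
     * that do_ctors() iterates; this runs from do_basic_setup(),
     * before any initcall. */
    static void __attribute__((constructor)) foo_ctor(void)
    {
            pr_info("foo_ctor: ran before initcalls\n");
    }
    #endif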
693 695
694 bool initcall_debug; 696 bool initcall_debug;
695 core_param(initcall_debug, initcall_debug, bool, 0644); 697 core_param(initcall_debug, initcall_debug, bool, 0644);
696 698
697 #ifdef CONFIG_KALLSYMS 699 #ifdef CONFIG_KALLSYMS
698 struct blacklist_entry { 700 struct blacklist_entry {
699 struct list_head next; 701 struct list_head next;
700 char *buf; 702 char *buf;
701 }; 703 };
702 704
703 static __initdata_or_module LIST_HEAD(blacklisted_initcalls); 705 static __initdata_or_module LIST_HEAD(blacklisted_initcalls);
704 706
705 static int __init initcall_blacklist(char *str) 707 static int __init initcall_blacklist(char *str)
706 { 708 {
707 char *str_entry; 709 char *str_entry;
708 struct blacklist_entry *entry; 710 struct blacklist_entry *entry;
709 711
710 /* str argument is a comma-separated list of functions */ 712 /* str argument is a comma-separated list of functions */
711 do { 713 do {
712 str_entry = strsep(&str, ","); 714 str_entry = strsep(&str, ",");
713 if (str_entry) { 715 if (str_entry) {
714 pr_debug("blacklisting initcall %s\n", str_entry); 716 pr_debug("blacklisting initcall %s\n", str_entry);
715 entry = alloc_bootmem(sizeof(*entry)); 717 entry = alloc_bootmem(sizeof(*entry));
716 entry->buf = alloc_bootmem(strlen(str_entry) + 1); 718 entry->buf = alloc_bootmem(strlen(str_entry) + 1);
717 strcpy(entry->buf, str_entry); 719 strcpy(entry->buf, str_entry);
718 list_add(&entry->next, &blacklisted_initcalls); 720 list_add(&entry->next, &blacklisted_initcalls);
719 } 721 }
720 } while (str_entry); 722 } while (str_entry);
721 723
722 return 0; 724 return 0;
723 } 725 }
724 726
725 static bool __init_or_module initcall_blacklisted(initcall_t fn) 727 static bool __init_or_module initcall_blacklisted(initcall_t fn)
726 { 728 {
727 struct list_head *tmp; 729 struct list_head *tmp;
728 struct blacklist_entry *entry; 730 struct blacklist_entry *entry;
729 char *fn_name; 731 char *fn_name;
730 732
731 fn_name = kasprintf(GFP_KERNEL, "%pf", fn); 733 fn_name = kasprintf(GFP_KERNEL, "%pf", fn);
732 if (!fn_name) 734 if (!fn_name)
733 return false; 735 return false;
734 736
735 list_for_each(tmp, &blacklisted_initcalls) { 737 list_for_each(tmp, &blacklisted_initcalls) {
736 entry = list_entry(tmp, struct blacklist_entry, next); 738 entry = list_entry(tmp, struct blacklist_entry, next);
737 if (!strcmp(fn_name, entry->buf)) { 739 if (!strcmp(fn_name, entry->buf)) {
738 pr_debug("initcall %s blacklisted\n", fn_name); 740 pr_debug("initcall %s blacklisted\n", fn_name);
739 kfree(fn_name); 741 kfree(fn_name);
740 return true; 742 return true;
741 } 743 }
742 } 744 }
743 745
744 kfree(fn_name); 746 kfree(fn_name);
745 return false; 747 return false;
746 } 748 }
747 #else 749 #else
748 static int __init initcall_blacklist(char *str) 750 static int __init initcall_blacklist(char *str)
749 { 751 {
750 pr_warn("initcall_blacklist requires CONFIG_KALLSYMS\n"); 752 pr_warn("initcall_blacklist requires CONFIG_KALLSYMS\n");
751 return 0; 753 return 0;
752 } 754 }
753 755
754 static bool __init_or_module initcall_blacklisted(initcall_t fn) 756 static bool __init_or_module initcall_blacklisted(initcall_t fn)
755 { 757 {
756 return false; 758 return false;
757 } 759 }
758 #endif 760 #endif
759 __setup("initcall_blacklist=", initcall_blacklist); 761 __setup("initcall_blacklist=", initcall_blacklist);
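
Taken together, the two halves above implement a boot-time filter: initcall_blacklist() stashes the comma-separated names from the command line in bootmem, and initcall_blacklisted() compares each initcall's symbolic name (rendered with the %pf printk format, i.e. the bare symbol without offset) against that list. A hypothetical driver initcall would be suppressed with:

    initcall_blacklist=foo_driver_init

On kernels without CONFIG_KALLSYMS there are no symbol names to match against, so the parameter is accepted but does nothing beyond the pr_warn().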
760 762
761 static int __init_or_module do_one_initcall_debug(initcall_t fn) 763 static int __init_or_module do_one_initcall_debug(initcall_t fn)
762 { 764 {
763 ktime_t calltime, delta, rettime; 765 ktime_t calltime, delta, rettime;
764 unsigned long long duration; 766 unsigned long long duration;
765 int ret; 767 int ret;
766 768
767 printk(KERN_DEBUG "calling %pF @ %i\n", fn, task_pid_nr(current)); 769 printk(KERN_DEBUG "calling %pF @ %i\n", fn, task_pid_nr(current));
768 calltime = ktime_get(); 770 calltime = ktime_get();
769 ret = fn(); 771 ret = fn();
770 rettime = ktime_get(); 772 rettime = ktime_get();
771 delta = ktime_sub(rettime, calltime); 773 delta = ktime_sub(rettime, calltime);
772 duration = (unsigned long long) ktime_to_ns(delta) >> 10; 774 duration = (unsigned long long) ktime_to_ns(delta) >> 10;
773 printk(KERN_DEBUG "initcall %pF returned %d after %lld usecs\n", 775 printk(KERN_DEBUG "initcall %pF returned %d after %lld usecs\n",
774 fn, ret, duration); 776 fn, ret, duration);
775 777
776 return ret; 778 return ret;
777 } 779 }
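
Booting with initcall_debug sends every initcall through this wrapper, bracketing it with output along these lines (symbol and timing are illustrative):

    calling  foo_driver_init+0x0/0x38 @ 1
    initcall foo_driver_init+0x0/0x38 returned 0 after 132 usecs

Note that the duration is ktime_to_ns(delta) >> 10, i.e. nanoseconds divided by 1024 rather than 1000, so the reported "usecs" run about 2% low: a cheap shift that is good enough for boot profiling.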
778 780
779 int __init_or_module do_one_initcall(initcall_t fn) 781 int __init_or_module do_one_initcall(initcall_t fn)
780 { 782 {
781 int count = preempt_count(); 783 int count = preempt_count();
782 int ret; 784 int ret;
783 char msgbuf[64]; 785 char msgbuf[64];
784 786
785 if (initcall_blacklisted(fn)) 787 if (initcall_blacklisted(fn))
786 return -EPERM; 788 return -EPERM;
787 789
788 if (initcall_debug) 790 if (initcall_debug)
789 ret = do_one_initcall_debug(fn); 791 ret = do_one_initcall_debug(fn);
790 else 792 else
791 ret = fn(); 793 ret = fn();
792 794
793 msgbuf[0] = 0; 795 msgbuf[0] = 0;
794 796
795 if (preempt_count() != count) { 797 if (preempt_count() != count) {
796 sprintf(msgbuf, "preemption imbalance "); 798 sprintf(msgbuf, "preemption imbalance ");
797 preempt_count_set(count); 799 preempt_count_set(count);
798 } 800 }
799 if (irqs_disabled()) { 801 if (irqs_disabled()) {
800 strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf)); 802 strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf));
801 local_irq_enable(); 803 local_irq_enable();
802 } 804 }
803 WARN(msgbuf[0], "initcall %pF returned with %s\n", fn, msgbuf); 805 WARN(msgbuf[0], "initcall %pF returned with %s\n", fn, msgbuf);
804 806
805 return ret; 807 return ret;
806 } 808 }
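
Beyond dispatch, do_one_initcall() also polices each call: a preempt_count() imbalance is repaired with preempt_count_set(), interrupts left disabled are re-enabled, and either offence is reported through the single WARN at the end, so one misbehaving initcall cannot silently wedge the rest of boot.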
807 809
808 810
809 extern initcall_t __initcall_start[]; 811 extern initcall_t __initcall_start[];
810 extern initcall_t __initcall0_start[]; 812 extern initcall_t __initcall0_start[];
811 extern initcall_t __initcall1_start[]; 813 extern initcall_t __initcall1_start[];
812 extern initcall_t __initcall2_start[]; 814 extern initcall_t __initcall2_start[];
813 extern initcall_t __initcall3_start[]; 815 extern initcall_t __initcall3_start[];
814 extern initcall_t __initcall4_start[]; 816 extern initcall_t __initcall4_start[];
815 extern initcall_t __initcall5_start[]; 817 extern initcall_t __initcall5_start[];
816 extern initcall_t __initcall6_start[]; 818 extern initcall_t __initcall6_start[];
817 extern initcall_t __initcall7_start[]; 819 extern initcall_t __initcall7_start[];
818 extern initcall_t __initcall_end[]; 820 extern initcall_t __initcall_end[];
819 821
820 static initcall_t *initcall_levels[] __initdata = { 822 static initcall_t *initcall_levels[] __initdata = {
821 __initcall0_start, 823 __initcall0_start,
822 __initcall1_start, 824 __initcall1_start,
823 __initcall2_start, 825 __initcall2_start,
824 __initcall3_start, 826 __initcall3_start,
825 __initcall4_start, 827 __initcall4_start,
826 __initcall5_start, 828 __initcall5_start,
827 __initcall6_start, 829 __initcall6_start,
828 __initcall7_start, 830 __initcall7_start,
829 __initcall_end, 831 __initcall_end,
830 }; 832 };
831 833
832 /* Keep these in sync with initcalls in include/linux/init.h */ 834 /* Keep these in sync with initcalls in include/linux/init.h */
833 static char *initcall_level_names[] __initdata = { 835 static char *initcall_level_names[] __initdata = {
834 "early", 836 "early",
835 "core", 837 "core",
836 "postcore", 838 "postcore",
837 "arch", 839 "arch",
838 "subsys", 840 "subsys",
839 "fs", 841 "fs",
840 "device", 842 "device",
841 "late", 843 "late",
842 }; 844 };
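
Each of these names labels one of the __initcall<N>_start sections declared above, populated via the *_initcall() macros from include/linux/init.h. (The array's "early" is the level-0 slot used by pure_initcall(); the pre-SMP early_initcall() entries live before __initcall0_start and are run separately by do_pre_smp_initcalls() below.) A minimal sketch, with foo_bus_init as a hypothetical example:

    /* Lands in the "subsys" (level 4) section: after every "arch"
     * (level 3) initcall, before any "fs" (level 5) one. */
    static int __init foo_bus_init(void)
    {
            return 0;
    }
    subsys_initcall(foo_bus_init);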
843 845
844 static void __init do_initcall_level(int level) 846 static void __init do_initcall_level(int level)
845 { 847 {
846 initcall_t *fn; 848 initcall_t *fn;
847 849
848 strcpy(initcall_command_line, saved_command_line); 850 strcpy(initcall_command_line, saved_command_line);
849 parse_args(initcall_level_names[level], 851 parse_args(initcall_level_names[level],
850 initcall_command_line, __start___param, 852 initcall_command_line, __start___param,
851 __stop___param - __start___param, 853 __stop___param - __start___param,
852 level, level, 854 level, level,
853 &repair_env_string); 855 &repair_env_string);
854 856
855 for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++) 857 for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++)
856 do_one_initcall(*fn); 858 do_one_initcall(*fn);
857 } 859 }
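
The command line is re-copied from saved_command_line on every level because parse_args() consumes its buffer in place; passing the level as both the minimum and maximum restricts each pass to parameters registered for exactly that level, so built-in module parameters take effect just before the corresponding initcalls run.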
858 860
859 static void __init do_initcalls(void) 861 static void __init do_initcalls(void)
860 { 862 {
861 int level; 863 int level;
862 864
863 for (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++) 865 for (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++)
864 do_initcall_level(level); 866 do_initcall_level(level);
865 } 867 }
866 868
867 /* 869 /*
868 * Ok, the machine is now initialized. None of the devices 870 * Ok, the machine is now initialized. None of the devices
869 * have been touched yet, but the CPU subsystem is up and 871 * have been touched yet, but the CPU subsystem is up and
870 * running, and memory and process management works. 872 * running, and memory and process management works.
871 * 873 *
872 * Now we can finally start doing some real work.. 874 * Now we can finally start doing some real work..
873 */ 875 */
874 static void __init do_basic_setup(void) 876 static void __init do_basic_setup(void)
875 { 877 {
876 cpuset_init_smp(); 878 cpuset_init_smp();
877 usermodehelper_init(); 879 usermodehelper_init();
878 shmem_init(); 880 shmem_init();
879 driver_init(); 881 driver_init();
880 init_irq_proc(); 882 init_irq_proc();
881 do_ctors(); 883 do_ctors();
882 usermodehelper_enable(); 884 usermodehelper_enable();
883 do_initcalls(); 885 do_initcalls();
884 random_int_secret_init(); 886 random_int_secret_init();
885 } 887 }
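
Ordering matters here: usermodehelper_enable() immediately precedes do_initcalls(), so initcalls are already free to call request_module() and spawn userspace helpers, while driver_init() and init_irq_proc() have set up the device model and /proc/irq beforehand.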
886 888
887 static void __init do_pre_smp_initcalls(void) 889 static void __init do_pre_smp_initcalls(void)
888 { 890 {
889 initcall_t *fn; 891 initcall_t *fn;
890 892
891 for (fn = __initcall_start; fn < __initcall0_start; fn++) 893 for (fn = __initcall_start; fn < __initcall0_start; fn++)
892 do_one_initcall(*fn); 894 do_one_initcall(*fn);
893 } 895 }
894 896
895 /* 897 /*
896 * This function requests modules which should be loaded by default and is 898 * This function requests modules which should be loaded by default and is
897 * called twice right after initrd is mounted and right before init is 899 * called twice right after initrd is mounted and right before init is
898 * exec'd. If such modules are on either initrd or rootfs, they will be 900 * exec'd. If such modules are on either initrd or rootfs, they will be
899 * loaded before control is passed to userland. 901 * loaded before control is passed to userland.
900 */ 902 */
901 void __init load_default_modules(void) 903 void __init load_default_modules(void)
902 { 904 {
903 load_default_elevator_module(); 905 load_default_elevator_module();
904 } 906 }
905 907
906 static int run_init_process(const char *init_filename) 908 static int run_init_process(const char *init_filename)
907 { 909 {
908 argv_init[0] = init_filename; 910 argv_init[0] = init_filename;
909 return do_execve(getname_kernel(init_filename), 911 return do_execve(getname_kernel(init_filename),
910 (const char __user *const __user *)argv_init, 912 (const char __user *const __user *)argv_init,
911 (const char __user *const __user *)envp_init); 913 (const char __user *const __user *)envp_init);
912 } 914 }
913 915
914 static int try_to_run_init_process(const char *init_filename) 916 static int try_to_run_init_process(const char *init_filename)
915 { 917 {
916 int ret; 918 int ret;
917 919
918 ret = run_init_process(init_filename); 920 ret = run_init_process(init_filename);
919 921
920 if (ret && ret != -ENOENT) { 922 if (ret && ret != -ENOENT) {
921 pr_err("Starting init: %s exists but couldn't execute it (error %d)\n", 923 pr_err("Starting init: %s exists but couldn't execute it (error %d)\n",
922 init_filename, ret); 924 init_filename, ret);
923 } 925 }
924 926
925 return ret; 927 return ret;
926 } 928 }
927 929
928 static noinline void __init kernel_init_freeable(void); 930 static noinline void __init kernel_init_freeable(void);
929 931
930 static int __ref kernel_init(void *unused) 932 static int __ref kernel_init(void *unused)
931 { 933 {
932 int ret; 934 int ret;
933 935
934 kernel_init_freeable(); 936 kernel_init_freeable();
935 /* need to finish all async __init code before freeing the memory */ 937 /* need to finish all async __init code before freeing the memory */
936 async_synchronize_full(); 938 async_synchronize_full();
937 free_initmem(); 939 free_initmem();
938 mark_rodata_ro(); 940 mark_rodata_ro();
939 system_state = SYSTEM_RUNNING; 941 system_state = SYSTEM_RUNNING;
940 numa_default_policy(); 942 numa_default_policy();
941 943
942 flush_delayed_fput(); 944 flush_delayed_fput();
943 945
944 if (ramdisk_execute_command) { 946 if (ramdisk_execute_command) {
945 ret = run_init_process(ramdisk_execute_command); 947 ret = run_init_process(ramdisk_execute_command);
946 if (!ret) 948 if (!ret)
947 return 0; 949 return 0;
948 pr_err("Failed to execute %s (error %d)\n", 950 pr_err("Failed to execute %s (error %d)\n",
949 ramdisk_execute_command, ret); 951 ramdisk_execute_command, ret);
950 } 952 }
951 953
952 /* 954 /*
953 * We try each of these until one succeeds. 955 * We try each of these until one succeeds.
954 * 956 *
955 * The Bourne shell can be used instead of init if we are 957 * The Bourne shell can be used instead of init if we are
956 * trying to recover a really broken machine. 958 * trying to recover a really broken machine.
957 */ 959 */
958 if (execute_command) { 960 if (execute_command) {
959 ret = run_init_process(execute_command); 961 ret = run_init_process(execute_command);
960 if (!ret) 962 if (!ret)
961 return 0; 963 return 0;
962 pr_err("Failed to execute %s (error %d). Attempting defaults...\n", 964 pr_err("Failed to execute %s (error %d). Attempting defaults...\n",
963 execute_command, ret); 965 execute_command, ret);
964 } 966 }
965 if (!try_to_run_init_process("/sbin/init") || 967 if (!try_to_run_init_process("/sbin/init") ||
966 !try_to_run_init_process("/etc/init") || 968 !try_to_run_init_process("/etc/init") ||
967 !try_to_run_init_process("/bin/init") || 969 !try_to_run_init_process("/bin/init") ||
968 !try_to_run_init_process("/bin/sh")) 970 !try_to_run_init_process("/bin/sh"))
969 return 0; 971 return 0;
970 972
971 panic("No working init found. Try passing init= option to kernel. " 973 panic("No working init found. Try passing init= option to kernel. "
972 "See Linux Documentation/init.txt for guidance."); 974 "See Linux Documentation/init.txt for guidance.");
973 } 975 }
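
The resulting search order is: the rdinit= target (ramdisk_execute_command) when an initramfs supplies one, then the init= target (execute_command), then the hard-wired /sbin/init, /etc/init, /bin/init, /bin/sh chain, and finally the panic. In practice that means a broken userland can usually be rescued from the boot loader with one of:

    init=/bin/sh        (root filesystem case)
    rdinit=/bin/sh      (initramfs case)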
974 976
975 static noinline void __init kernel_init_freeable(void) 977 static noinline void __init kernel_init_freeable(void)
976 { 978 {
977 /* 979 /*
978 * Wait until kthreadd is all set-up. 980 * Wait until kthreadd is all set-up.
979 */ 981 */
980 wait_for_completion(&kthreadd_done); 982 wait_for_completion(&kthreadd_done);
981 983
982 /* Now the scheduler is fully set up and can do blocking allocations */ 984 /* Now the scheduler is fully set up and can do blocking allocations */
983 gfp_allowed_mask = __GFP_BITS_MASK; 985 gfp_allowed_mask = __GFP_BITS_MASK;
984 986
985 /* 987 /*
986 * init can allocate pages on any node 988 * init can allocate pages on any node
987 */ 989 */
988 set_mems_allowed(node_states[N_MEMORY]); 990 set_mems_allowed(node_states[N_MEMORY]);
989 /* 991 /*
990 * init can run on any cpu. 992 * init can run on any cpu.
991 */ 993 */
992 set_cpus_allowed_ptr(current, cpu_all_mask); 994 set_cpus_allowed_ptr(current, cpu_all_mask);
993 995
994 cad_pid = task_pid(current); 996 cad_pid = task_pid(current);
995 997
996 smp_prepare_cpus(setup_max_cpus); 998 smp_prepare_cpus(setup_max_cpus);
997 999
998 do_pre_smp_initcalls(); 1000 do_pre_smp_initcalls();
999 lockup_detector_init(); 1001 lockup_detector_init();
1000 1002
1001 smp_init(); 1003 smp_init();
1002 sched_init_smp(); 1004 sched_init_smp();
1003 1005
1004 do_basic_setup(); 1006 do_basic_setup();
1005 1007
1006 /* Open the /dev/console on the rootfs, this should never fail */ 1008 /* Open the /dev/console on the rootfs, this should never fail */
1007 if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) 1009 if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)
1008 pr_err("Warning: unable to open an initial console.\n"); 1010 pr_err("Warning: unable to open an initial console.\n");
1009 1011
1010 (void) sys_dup(0); 1012 (void) sys_dup(0);
1011 (void) sys_dup(0); 1013 (void) sys_dup(0);
1012 /* 1014 /*
1013 * check if there is an early userspace init. If yes, let it do all 1015 * check if there is an early userspace init. If yes, let it do all
1014 * the work 1016 * the work
1015 */ 1017 */
1016 1018
1017 if (!ramdisk_execute_command) 1019 if (!ramdisk_execute_command)
1018 ramdisk_execute_command = "/init"; 1020 ramdisk_execute_command = "/init";
1019 1021
1020 if (sys_access((const char __user *) ramdisk_execute_command, 0) != 0) { 1022 if (sys_access((const char __user *) ramdisk_execute_command, 0) != 0) {
1021 ramdisk_execute_command = NULL; 1023 ramdisk_execute_command = NULL;
1022 prepare_namespace(); 1024 prepare_namespace();
1023 } 1025 }
1024 1026
1025 /* 1027 /*
1026 * Ok, we have completed the initial bootup, and 1028 * Ok, we have completed the initial bootup, and
1027 * we're essentially up and running. Get rid of the 1029 * we're essentially up and running. Get rid of the
1028 * initmem segments and start the user-mode stuff.. 1030 * initmem segments and start the user-mode stuff..
1029 */ 1031 */
1030 1032
1031 /* rootfs is available now, try loading default modules */ 1033 /* rootfs is available now, try loading default modules */
1032 load_default_modules(); 1034 load_default_modules();
1033 } 1035 }
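
One detail worth calling out near the end of kernel_init_freeable(): the sys_open() of /dev/console lands on fd 0, and the two sys_dup(0) calls clone it onto fds 1 and 2, which is how PID 1 (and everything it spawns) comes to own a conventional stdin/stdout/stderr triple before any init binary is exec'd.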
1034 1036