Commit 7bfac9ecf0585962fe13584f5cf526d8c8e76f17

Authored by Miklos Szeredi
Committed by Linus Torvalds
1 parent 612392307c

splice: fix deadlock in splicing to file

There's a possible deadlock in generic_file_splice_write(),
splice_from_pipe() and ocfs2_file_splice_write():

 - task A calls generic_file_splice_write()
 - this calls inode_double_lock(), which locks i_mutex on both
   pipe->inode and target inode
 - ordering depends on inode pointers, can happen that pipe->inode is
   locked first
 - __splice_from_pipe() needs more data, calls pipe_wait()
 - this releases lock on pipe->inode, goes to interruptible sleep
 - task B calls generic_file_splice_write(), similarly to the first
 - this locks pipe->inode, then tries to lock inode, but that is
   already held by task A
 - task A is woken by a signal; it tries to re-acquire pipe->inode, but
   blocks, as that lock is already held by task B
 - ABBA deadlock

Fix this by explicitly ordering locks: the outer lock must be on
target inode and the inner lock (which is later unlocked and relocked)
must be on pipe->inode.  This is OK, pipe inodes and target inodes
form two nonoverlapping sets, generic_file_splice_write() and friends
are not called with a target which is a pipe.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Acked-by: Mark Fasheh <mfasheh@suse.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 2 changed files with 26 additions and 7 deletions Inline Diff

1 /* -*- mode: c; c-basic-offset: 8; -*- 1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * file.c 4 * file.c
5 * 5 *
6 * File open, close, extend, truncate 6 * File open, close, extend, truncate
7 * 7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 * 9 *
10 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public 11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either 12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version. 13 * version 2 of the License, or (at your option) any later version.
14 * 14 *
15 * This program is distributed in the hope that it will be useful, 15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details. 18 * General Public License for more details.
19 * 19 *
20 * You should have received a copy of the GNU General Public 20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the 21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA. 23 * Boston, MA 021110-1307, USA.
24 */ 24 */
25 25
26 #include <linux/capability.h> 26 #include <linux/capability.h>
27 #include <linux/fs.h> 27 #include <linux/fs.h>
28 #include <linux/types.h> 28 #include <linux/types.h>
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 #include <linux/highmem.h> 30 #include <linux/highmem.h>
31 #include <linux/pagemap.h> 31 #include <linux/pagemap.h>
32 #include <linux/uio.h> 32 #include <linux/uio.h>
33 #include <linux/sched.h> 33 #include <linux/sched.h>
34 #include <linux/splice.h> 34 #include <linux/splice.h>
35 #include <linux/mount.h> 35 #include <linux/mount.h>
36 #include <linux/writeback.h> 36 #include <linux/writeback.h>
37 #include <linux/falloc.h> 37 #include <linux/falloc.h>
38 #include <linux/quotaops.h> 38 #include <linux/quotaops.h>
39 39
40 #define MLOG_MASK_PREFIX ML_INODE 40 #define MLOG_MASK_PREFIX ML_INODE
41 #include <cluster/masklog.h> 41 #include <cluster/masklog.h>
42 42
43 #include "ocfs2.h" 43 #include "ocfs2.h"
44 44
45 #include "alloc.h" 45 #include "alloc.h"
46 #include "aops.h" 46 #include "aops.h"
47 #include "dir.h" 47 #include "dir.h"
48 #include "dlmglue.h" 48 #include "dlmglue.h"
49 #include "extent_map.h" 49 #include "extent_map.h"
50 #include "file.h" 50 #include "file.h"
51 #include "sysfile.h" 51 #include "sysfile.h"
52 #include "inode.h" 52 #include "inode.h"
53 #include "ioctl.h" 53 #include "ioctl.h"
54 #include "journal.h" 54 #include "journal.h"
55 #include "locks.h" 55 #include "locks.h"
56 #include "mmap.h" 56 #include "mmap.h"
57 #include "suballoc.h" 57 #include "suballoc.h"
58 #include "super.h" 58 #include "super.h"
59 #include "xattr.h" 59 #include "xattr.h"
60 #include "acl.h" 60 #include "acl.h"
61 #include "quota.h" 61 #include "quota.h"
62 62
63 #include "buffer_head_io.h" 63 #include "buffer_head_io.h"
64 64
65 static int ocfs2_sync_inode(struct inode *inode) 65 static int ocfs2_sync_inode(struct inode *inode)
66 { 66 {
67 filemap_fdatawrite(inode->i_mapping); 67 filemap_fdatawrite(inode->i_mapping);
68 return sync_mapping_buffers(inode->i_mapping); 68 return sync_mapping_buffers(inode->i_mapping);
69 } 69 }
70 70
71 static int ocfs2_init_file_private(struct inode *inode, struct file *file) 71 static int ocfs2_init_file_private(struct inode *inode, struct file *file)
72 { 72 {
73 struct ocfs2_file_private *fp; 73 struct ocfs2_file_private *fp;
74 74
75 fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL); 75 fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
76 if (!fp) 76 if (!fp)
77 return -ENOMEM; 77 return -ENOMEM;
78 78
79 fp->fp_file = file; 79 fp->fp_file = file;
80 mutex_init(&fp->fp_mutex); 80 mutex_init(&fp->fp_mutex);
81 ocfs2_file_lock_res_init(&fp->fp_flock, fp); 81 ocfs2_file_lock_res_init(&fp->fp_flock, fp);
82 file->private_data = fp; 82 file->private_data = fp;
83 83
84 return 0; 84 return 0;
85 } 85 }
86 86
87 static void ocfs2_free_file_private(struct inode *inode, struct file *file) 87 static void ocfs2_free_file_private(struct inode *inode, struct file *file)
88 { 88 {
89 struct ocfs2_file_private *fp = file->private_data; 89 struct ocfs2_file_private *fp = file->private_data;
90 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 90 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
91 91
92 if (fp) { 92 if (fp) {
93 ocfs2_simple_drop_lockres(osb, &fp->fp_flock); 93 ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
94 ocfs2_lock_res_free(&fp->fp_flock); 94 ocfs2_lock_res_free(&fp->fp_flock);
95 kfree(fp); 95 kfree(fp);
96 file->private_data = NULL; 96 file->private_data = NULL;
97 } 97 }
98 } 98 }
99 99
100 static int ocfs2_file_open(struct inode *inode, struct file *file) 100 static int ocfs2_file_open(struct inode *inode, struct file *file)
101 { 101 {
102 int status; 102 int status;
103 int mode = file->f_flags; 103 int mode = file->f_flags;
104 struct ocfs2_inode_info *oi = OCFS2_I(inode); 104 struct ocfs2_inode_info *oi = OCFS2_I(inode);
105 105
106 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 106 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
107 file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); 107 file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);
108 108
109 spin_lock(&oi->ip_lock); 109 spin_lock(&oi->ip_lock);
110 110
111 /* Check that the inode hasn't been wiped from disk by another 111 /* Check that the inode hasn't been wiped from disk by another
112 * node. If it hasn't then we're safe as long as we hold the 112 * node. If it hasn't then we're safe as long as we hold the
113 * spin lock until our increment of open count. */ 113 * spin lock until our increment of open count. */
114 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { 114 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
115 spin_unlock(&oi->ip_lock); 115 spin_unlock(&oi->ip_lock);
116 116
117 status = -ENOENT; 117 status = -ENOENT;
118 goto leave; 118 goto leave;
119 } 119 }
120 120
121 if (mode & O_DIRECT) 121 if (mode & O_DIRECT)
122 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; 122 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
123 123
124 oi->ip_open_count++; 124 oi->ip_open_count++;
125 spin_unlock(&oi->ip_lock); 125 spin_unlock(&oi->ip_lock);
126 126
127 status = ocfs2_init_file_private(inode, file); 127 status = ocfs2_init_file_private(inode, file);
128 if (status) { 128 if (status) {
129 /* 129 /*
130 * We want to set open count back if we're failing the 130 * We want to set open count back if we're failing the
131 * open. 131 * open.
132 */ 132 */
133 spin_lock(&oi->ip_lock); 133 spin_lock(&oi->ip_lock);
134 oi->ip_open_count--; 134 oi->ip_open_count--;
135 spin_unlock(&oi->ip_lock); 135 spin_unlock(&oi->ip_lock);
136 } 136 }
137 137
138 leave: 138 leave:
139 mlog_exit(status); 139 mlog_exit(status);
140 return status; 140 return status;
141 } 141 }
142 142
143 static int ocfs2_file_release(struct inode *inode, struct file *file) 143 static int ocfs2_file_release(struct inode *inode, struct file *file)
144 { 144 {
145 struct ocfs2_inode_info *oi = OCFS2_I(inode); 145 struct ocfs2_inode_info *oi = OCFS2_I(inode);
146 146
147 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 147 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
148 file->f_path.dentry->d_name.len, 148 file->f_path.dentry->d_name.len,
149 file->f_path.dentry->d_name.name); 149 file->f_path.dentry->d_name.name);
150 150
151 spin_lock(&oi->ip_lock); 151 spin_lock(&oi->ip_lock);
152 if (!--oi->ip_open_count) 152 if (!--oi->ip_open_count)
153 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; 153 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
154 spin_unlock(&oi->ip_lock); 154 spin_unlock(&oi->ip_lock);
155 155
156 ocfs2_free_file_private(inode, file); 156 ocfs2_free_file_private(inode, file);
157 157
158 mlog_exit(0); 158 mlog_exit(0);
159 159
160 return 0; 160 return 0;
161 } 161 }
162 162
/* ->open() for directories: only needs the per-open private data. */
static int ocfs2_dir_open(struct inode *inode, struct file *file)
{
	return ocfs2_init_file_private(inode, file);
}
167 167
/* ->release() for directories: free the per-open private data. */
static int ocfs2_dir_release(struct inode *inode, struct file *file)
{
	ocfs2_free_file_private(inode, file);
	return 0;
}
173 173
174 static int ocfs2_sync_file(struct file *file, 174 static int ocfs2_sync_file(struct file *file,
175 struct dentry *dentry, 175 struct dentry *dentry,
176 int datasync) 176 int datasync)
177 { 177 {
178 int err = 0; 178 int err = 0;
179 journal_t *journal; 179 journal_t *journal;
180 struct inode *inode = dentry->d_inode; 180 struct inode *inode = dentry->d_inode;
181 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 181 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
182 182
183 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 183 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
184 dentry->d_name.len, dentry->d_name.name); 184 dentry->d_name.len, dentry->d_name.name);
185 185
186 err = ocfs2_sync_inode(dentry->d_inode); 186 err = ocfs2_sync_inode(dentry->d_inode);
187 if (err) 187 if (err)
188 goto bail; 188 goto bail;
189 189
190 journal = osb->journal->j_journal; 190 journal = osb->journal->j_journal;
191 err = jbd2_journal_force_commit(journal); 191 err = jbd2_journal_force_commit(journal);
192 192
193 bail: 193 bail:
194 mlog_exit(err); 194 mlog_exit(err);
195 195
196 return (err < 0) ? -EIO : 0; 196 return (err < 0) ? -EIO : 0;
197 } 197 }
198 198
199 int ocfs2_should_update_atime(struct inode *inode, 199 int ocfs2_should_update_atime(struct inode *inode,
200 struct vfsmount *vfsmnt) 200 struct vfsmount *vfsmnt)
201 { 201 {
202 struct timespec now; 202 struct timespec now;
203 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 203 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
204 204
205 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 205 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
206 return 0; 206 return 0;
207 207
208 if ((inode->i_flags & S_NOATIME) || 208 if ((inode->i_flags & S_NOATIME) ||
209 ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) 209 ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)))
210 return 0; 210 return 0;
211 211
212 /* 212 /*
213 * We can be called with no vfsmnt structure - NFSD will 213 * We can be called with no vfsmnt structure - NFSD will
214 * sometimes do this. 214 * sometimes do this.
215 * 215 *
216 * Note that our action here is different than touch_atime() - 216 * Note that our action here is different than touch_atime() -
217 * if we can't tell whether this is a noatime mount, then we 217 * if we can't tell whether this is a noatime mount, then we
218 * don't know whether to trust the value of s_atime_quantum. 218 * don't know whether to trust the value of s_atime_quantum.
219 */ 219 */
220 if (vfsmnt == NULL) 220 if (vfsmnt == NULL)
221 return 0; 221 return 0;
222 222
223 if ((vfsmnt->mnt_flags & MNT_NOATIME) || 223 if ((vfsmnt->mnt_flags & MNT_NOATIME) ||
224 ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) 224 ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
225 return 0; 225 return 0;
226 226
227 if (vfsmnt->mnt_flags & MNT_RELATIME) { 227 if (vfsmnt->mnt_flags & MNT_RELATIME) {
228 if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || 228 if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
229 (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) 229 (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0))
230 return 1; 230 return 1;
231 231
232 return 0; 232 return 0;
233 } 233 }
234 234
235 now = CURRENT_TIME; 235 now = CURRENT_TIME;
236 if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) 236 if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
237 return 0; 237 return 0;
238 else 238 else
239 return 1; 239 return 1;
240 } 240 }
241 241
242 int ocfs2_update_inode_atime(struct inode *inode, 242 int ocfs2_update_inode_atime(struct inode *inode,
243 struct buffer_head *bh) 243 struct buffer_head *bh)
244 { 244 {
245 int ret; 245 int ret;
246 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 246 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
247 handle_t *handle; 247 handle_t *handle;
248 struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data; 248 struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;
249 249
250 mlog_entry_void(); 250 mlog_entry_void();
251 251
252 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 252 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
253 if (IS_ERR(handle)) { 253 if (IS_ERR(handle)) {
254 ret = PTR_ERR(handle); 254 ret = PTR_ERR(handle);
255 mlog_errno(ret); 255 mlog_errno(ret);
256 goto out; 256 goto out;
257 } 257 }
258 258
259 ret = ocfs2_journal_access_di(handle, inode, bh, 259 ret = ocfs2_journal_access_di(handle, inode, bh,
260 OCFS2_JOURNAL_ACCESS_WRITE); 260 OCFS2_JOURNAL_ACCESS_WRITE);
261 if (ret) { 261 if (ret) {
262 mlog_errno(ret); 262 mlog_errno(ret);
263 goto out_commit; 263 goto out_commit;
264 } 264 }
265 265
266 /* 266 /*
267 * Don't use ocfs2_mark_inode_dirty() here as we don't always 267 * Don't use ocfs2_mark_inode_dirty() here as we don't always
268 * have i_mutex to guard against concurrent changes to other 268 * have i_mutex to guard against concurrent changes to other
269 * inode fields. 269 * inode fields.
270 */ 270 */
271 inode->i_atime = CURRENT_TIME; 271 inode->i_atime = CURRENT_TIME;
272 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); 272 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
273 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); 273 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
274 274
275 ret = ocfs2_journal_dirty(handle, bh); 275 ret = ocfs2_journal_dirty(handle, bh);
276 if (ret < 0) 276 if (ret < 0)
277 mlog_errno(ret); 277 mlog_errno(ret);
278 278
279 out_commit: 279 out_commit:
280 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 280 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
281 out: 281 out:
282 mlog_exit(ret); 282 mlog_exit(ret);
283 return ret; 283 return ret;
284 } 284 }
285 285
286 static int ocfs2_set_inode_size(handle_t *handle, 286 static int ocfs2_set_inode_size(handle_t *handle,
287 struct inode *inode, 287 struct inode *inode,
288 struct buffer_head *fe_bh, 288 struct buffer_head *fe_bh,
289 u64 new_i_size) 289 u64 new_i_size)
290 { 290 {
291 int status; 291 int status;
292 292
293 mlog_entry_void(); 293 mlog_entry_void();
294 i_size_write(inode, new_i_size); 294 i_size_write(inode, new_i_size);
295 inode->i_blocks = ocfs2_inode_sector_count(inode); 295 inode->i_blocks = ocfs2_inode_sector_count(inode);
296 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 296 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
297 297
298 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 298 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
299 if (status < 0) { 299 if (status < 0) {
300 mlog_errno(status); 300 mlog_errno(status);
301 goto bail; 301 goto bail;
302 } 302 }
303 303
304 bail: 304 bail:
305 mlog_exit(status); 305 mlog_exit(status);
306 return status; 306 return status;
307 } 307 }
308 308
309 int ocfs2_simple_size_update(struct inode *inode, 309 int ocfs2_simple_size_update(struct inode *inode,
310 struct buffer_head *di_bh, 310 struct buffer_head *di_bh,
311 u64 new_i_size) 311 u64 new_i_size)
312 { 312 {
313 int ret; 313 int ret;
314 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 314 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
315 handle_t *handle = NULL; 315 handle_t *handle = NULL;
316 316
317 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 317 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
318 if (IS_ERR(handle)) { 318 if (IS_ERR(handle)) {
319 ret = PTR_ERR(handle); 319 ret = PTR_ERR(handle);
320 mlog_errno(ret); 320 mlog_errno(ret);
321 goto out; 321 goto out;
322 } 322 }
323 323
324 ret = ocfs2_set_inode_size(handle, inode, di_bh, 324 ret = ocfs2_set_inode_size(handle, inode, di_bh,
325 new_i_size); 325 new_i_size);
326 if (ret < 0) 326 if (ret < 0)
327 mlog_errno(ret); 327 mlog_errno(ret);
328 328
329 ocfs2_commit_trans(osb, handle); 329 ocfs2_commit_trans(osb, handle);
330 out: 330 out:
331 return ret; 331 return ret;
332 } 332 }
333 333
334 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, 334 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
335 struct inode *inode, 335 struct inode *inode,
336 struct buffer_head *fe_bh, 336 struct buffer_head *fe_bh,
337 u64 new_i_size) 337 u64 new_i_size)
338 { 338 {
339 int status; 339 int status;
340 handle_t *handle; 340 handle_t *handle;
341 struct ocfs2_dinode *di; 341 struct ocfs2_dinode *di;
342 u64 cluster_bytes; 342 u64 cluster_bytes;
343 343
344 mlog_entry_void(); 344 mlog_entry_void();
345 345
346 /* TODO: This needs to actually orphan the inode in this 346 /* TODO: This needs to actually orphan the inode in this
347 * transaction. */ 347 * transaction. */
348 348
349 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 349 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
350 if (IS_ERR(handle)) { 350 if (IS_ERR(handle)) {
351 status = PTR_ERR(handle); 351 status = PTR_ERR(handle);
352 mlog_errno(status); 352 mlog_errno(status);
353 goto out; 353 goto out;
354 } 354 }
355 355
356 status = ocfs2_journal_access_di(handle, inode, fe_bh, 356 status = ocfs2_journal_access_di(handle, inode, fe_bh,
357 OCFS2_JOURNAL_ACCESS_WRITE); 357 OCFS2_JOURNAL_ACCESS_WRITE);
358 if (status < 0) { 358 if (status < 0) {
359 mlog_errno(status); 359 mlog_errno(status);
360 goto out_commit; 360 goto out_commit;
361 } 361 }
362 362
363 /* 363 /*
364 * Do this before setting i_size. 364 * Do this before setting i_size.
365 */ 365 */
366 cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); 366 cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
367 status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size, 367 status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
368 cluster_bytes); 368 cluster_bytes);
369 if (status) { 369 if (status) {
370 mlog_errno(status); 370 mlog_errno(status);
371 goto out_commit; 371 goto out_commit;
372 } 372 }
373 373
374 i_size_write(inode, new_i_size); 374 i_size_write(inode, new_i_size);
375 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 375 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
376 376
377 di = (struct ocfs2_dinode *) fe_bh->b_data; 377 di = (struct ocfs2_dinode *) fe_bh->b_data;
378 di->i_size = cpu_to_le64(new_i_size); 378 di->i_size = cpu_to_le64(new_i_size);
379 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); 379 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
380 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 380 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
381 381
382 status = ocfs2_journal_dirty(handle, fe_bh); 382 status = ocfs2_journal_dirty(handle, fe_bh);
383 if (status < 0) 383 if (status < 0)
384 mlog_errno(status); 384 mlog_errno(status);
385 385
386 out_commit: 386 out_commit:
387 ocfs2_commit_trans(osb, handle); 387 ocfs2_commit_trans(osb, handle);
388 out: 388 out:
389 389
390 mlog_exit(status); 390 mlog_exit(status);
391 return status; 391 return status;
392 } 392 }
393 393
394 static int ocfs2_truncate_file(struct inode *inode, 394 static int ocfs2_truncate_file(struct inode *inode,
395 struct buffer_head *di_bh, 395 struct buffer_head *di_bh,
396 u64 new_i_size) 396 u64 new_i_size)
397 { 397 {
398 int status = 0; 398 int status = 0;
399 struct ocfs2_dinode *fe = NULL; 399 struct ocfs2_dinode *fe = NULL;
400 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 400 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
401 struct ocfs2_truncate_context *tc = NULL; 401 struct ocfs2_truncate_context *tc = NULL;
402 402
403 mlog_entry("(inode = %llu, new_i_size = %llu\n", 403 mlog_entry("(inode = %llu, new_i_size = %llu\n",
404 (unsigned long long)OCFS2_I(inode)->ip_blkno, 404 (unsigned long long)OCFS2_I(inode)->ip_blkno,
405 (unsigned long long)new_i_size); 405 (unsigned long long)new_i_size);
406 406
407 /* We trust di_bh because it comes from ocfs2_inode_lock(), which 407 /* We trust di_bh because it comes from ocfs2_inode_lock(), which
408 * already validated it */ 408 * already validated it */
409 fe = (struct ocfs2_dinode *) di_bh->b_data; 409 fe = (struct ocfs2_dinode *) di_bh->b_data;
410 410
411 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), 411 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
412 "Inode %llu, inode i_size = %lld != di " 412 "Inode %llu, inode i_size = %lld != di "
413 "i_size = %llu, i_flags = 0x%x\n", 413 "i_size = %llu, i_flags = 0x%x\n",
414 (unsigned long long)OCFS2_I(inode)->ip_blkno, 414 (unsigned long long)OCFS2_I(inode)->ip_blkno,
415 i_size_read(inode), 415 i_size_read(inode),
416 (unsigned long long)le64_to_cpu(fe->i_size), 416 (unsigned long long)le64_to_cpu(fe->i_size),
417 le32_to_cpu(fe->i_flags)); 417 le32_to_cpu(fe->i_flags));
418 418
419 if (new_i_size > le64_to_cpu(fe->i_size)) { 419 if (new_i_size > le64_to_cpu(fe->i_size)) {
420 mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n", 420 mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n",
421 (unsigned long long)le64_to_cpu(fe->i_size), 421 (unsigned long long)le64_to_cpu(fe->i_size),
422 (unsigned long long)new_i_size); 422 (unsigned long long)new_i_size);
423 status = -EINVAL; 423 status = -EINVAL;
424 mlog_errno(status); 424 mlog_errno(status);
425 goto bail; 425 goto bail;
426 } 426 }
427 427
428 mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n", 428 mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",
429 (unsigned long long)le64_to_cpu(fe->i_blkno), 429 (unsigned long long)le64_to_cpu(fe->i_blkno),
430 (unsigned long long)le64_to_cpu(fe->i_size), 430 (unsigned long long)le64_to_cpu(fe->i_size),
431 (unsigned long long)new_i_size); 431 (unsigned long long)new_i_size);
432 432
433 /* lets handle the simple truncate cases before doing any more 433 /* lets handle the simple truncate cases before doing any more
434 * cluster locking. */ 434 * cluster locking. */
435 if (new_i_size == le64_to_cpu(fe->i_size)) 435 if (new_i_size == le64_to_cpu(fe->i_size))
436 goto bail; 436 goto bail;
437 437
438 down_write(&OCFS2_I(inode)->ip_alloc_sem); 438 down_write(&OCFS2_I(inode)->ip_alloc_sem);
439 439
440 /* 440 /*
441 * The inode lock forced other nodes to sync and drop their 441 * The inode lock forced other nodes to sync and drop their
442 * pages, which (correctly) happens even if we have a truncate 442 * pages, which (correctly) happens even if we have a truncate
443 * without allocation change - ocfs2 cluster sizes can be much 443 * without allocation change - ocfs2 cluster sizes can be much
444 * greater than page size, so we have to truncate them 444 * greater than page size, so we have to truncate them
445 * anyway. 445 * anyway.
446 */ 446 */
447 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); 447 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
448 truncate_inode_pages(inode->i_mapping, new_i_size); 448 truncate_inode_pages(inode->i_mapping, new_i_size);
449 449
450 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 450 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
451 status = ocfs2_truncate_inline(inode, di_bh, new_i_size, 451 status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
452 i_size_read(inode), 1); 452 i_size_read(inode), 1);
453 if (status) 453 if (status)
454 mlog_errno(status); 454 mlog_errno(status);
455 455
456 goto bail_unlock_sem; 456 goto bail_unlock_sem;
457 } 457 }
458 458
459 /* alright, we're going to need to do a full blown alloc size 459 /* alright, we're going to need to do a full blown alloc size
460 * change. Orphan the inode so that recovery can complete the 460 * change. Orphan the inode so that recovery can complete the
461 * truncate if necessary. This does the task of marking 461 * truncate if necessary. This does the task of marking
462 * i_size. */ 462 * i_size. */
463 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 463 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
464 if (status < 0) { 464 if (status < 0) {
465 mlog_errno(status); 465 mlog_errno(status);
466 goto bail_unlock_sem; 466 goto bail_unlock_sem;
467 } 467 }
468 468
469 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 469 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
470 if (status < 0) { 470 if (status < 0) {
471 mlog_errno(status); 471 mlog_errno(status);
472 goto bail_unlock_sem; 472 goto bail_unlock_sem;
473 } 473 }
474 474
475 status = ocfs2_commit_truncate(osb, inode, di_bh, tc); 475 status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
476 if (status < 0) { 476 if (status < 0) {
477 mlog_errno(status); 477 mlog_errno(status);
478 goto bail_unlock_sem; 478 goto bail_unlock_sem;
479 } 479 }
480 480
481 /* TODO: orphan dir cleanup here. */ 481 /* TODO: orphan dir cleanup here. */
482 bail_unlock_sem: 482 bail_unlock_sem:
483 up_write(&OCFS2_I(inode)->ip_alloc_sem); 483 up_write(&OCFS2_I(inode)->ip_alloc_sem);
484 484
485 bail: 485 bail:
486 486
487 mlog_exit(status); 487 mlog_exit(status);
488 return status; 488 return status;
489 } 489 }
490 490
491 /* 491 /*
492 * extend file allocation only here. 492 * extend file allocation only here.
493 * we'll update all the disk stuff, and oip->alloc_size 493 * we'll update all the disk stuff, and oip->alloc_size
494 * 494 *
495 * expect stuff to be locked, a transaction started and enough data / 495 * expect stuff to be locked, a transaction started and enough data /
496 * metadata reservations in the contexts. 496 * metadata reservations in the contexts.
497 * 497 *
498 * Will return -EAGAIN, and a reason if a restart is needed. 498 * Will return -EAGAIN, and a reason if a restart is needed.
499 * If passed in, *reason will always be set, even in error. 499 * If passed in, *reason will always be set, even in error.
500 */ 500 */
501 int ocfs2_add_inode_data(struct ocfs2_super *osb, 501 int ocfs2_add_inode_data(struct ocfs2_super *osb,
502 struct inode *inode, 502 struct inode *inode,
503 u32 *logical_offset, 503 u32 *logical_offset,
504 u32 clusters_to_add, 504 u32 clusters_to_add,
505 int mark_unwritten, 505 int mark_unwritten,
506 struct buffer_head *fe_bh, 506 struct buffer_head *fe_bh,
507 handle_t *handle, 507 handle_t *handle,
508 struct ocfs2_alloc_context *data_ac, 508 struct ocfs2_alloc_context *data_ac,
509 struct ocfs2_alloc_context *meta_ac, 509 struct ocfs2_alloc_context *meta_ac,
510 enum ocfs2_alloc_restarted *reason_ret) 510 enum ocfs2_alloc_restarted *reason_ret)
511 { 511 {
512 int ret; 512 int ret;
513 struct ocfs2_extent_tree et; 513 struct ocfs2_extent_tree et;
514 514
515 ocfs2_init_dinode_extent_tree(&et, inode, fe_bh); 515 ocfs2_init_dinode_extent_tree(&et, inode, fe_bh);
516 ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset, 516 ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset,
517 clusters_to_add, mark_unwritten, 517 clusters_to_add, mark_unwritten,
518 &et, handle, 518 &et, handle,
519 data_ac, meta_ac, reason_ret); 519 data_ac, meta_ac, reason_ret);
520 520
521 return ret; 521 return ret;
522 } 522 }
523 523
/*
 * Grow @inode's allocation by @clusters_to_add clusters of data,
 * beginning at logical cluster @logical_start.  When @mark_unwritten is
 * nonzero the new extents are flagged unwritten, which is only legal on
 * sparse-file capable volumes (BUG otherwise).
 *
 * The allocation may require multiple passes: the allocator can ask us
 * to restart the whole function (re-reserve metadata) or just extend
 * the running transaction, which is handled via the restart_all /
 * restarted_transaction labels below.
 *
 * Returns 0 on success or a negative error code.
 */
static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
				     u32 clusters_to_add, int mark_unwritten)
{
	int status = 0;
	int restart_func = 0;	/* set when the allocator asks for a full restart */
	int credits;
	u32 prev_clusters;
	struct buffer_head *bh = NULL;
	struct ocfs2_dinode *fe = NULL;
	handle_t *handle = NULL;
	struct ocfs2_alloc_context *data_ac = NULL;
	struct ocfs2_alloc_context *meta_ac = NULL;
	enum ocfs2_alloc_restarted why;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_extent_tree et;
	int did_quota = 0;	/* nonzero while we hold a quota reservation */

	mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);

	/*
	 * This function only exists for file systems which don't
	 * support holes.
	 */
	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));

	status = ocfs2_read_inode_block(inode, &bh);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}
	fe = (struct ocfs2_dinode *) bh->b_data;

restart_all:
	/* in-memory and on-disk cluster counts must agree before we extend */
	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);

	mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
	     "clusters_to_add = %u\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
	     clusters_to_add);
	/* reserve data clusters and (possibly) metadata blocks up front */
	ocfs2_init_dinode_extent_tree(&et, inode, bh);
	status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
				       &data_ac, &meta_ac);
	if (status) {
		mlog_errno(status);
		goto leave;
	}

	credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
					    clusters_to_add);
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto leave;
	}

restarted_transaction:
	/* charge the whole remaining request to quota before allocating;
	 * any unused part is given back after ocfs2_add_inode_data() */
	if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb,
	    clusters_to_add))) {
		status = -EDQUOT;
		goto leave;
	}
	did_quota = 1;

	/* reserve a write to the file entry early on - that we if we
	 * run out of credits in the allocation path, we can still
	 * update i_size. */
	status = ocfs2_journal_access_di(handle, inode, bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	prev_clusters = OCFS2_I(inode)->ip_clusters;

	/* -EAGAIN here means a partial allocation; @why tells us how to
	 * restart (see the RESTART_* handling below) */
	status = ocfs2_add_inode_data(osb,
				      inode,
				      &logical_start,
				      clusters_to_add,
				      mark_unwritten,
				      bh,
				      handle,
				      data_ac,
				      meta_ac,
				      &why);
	if ((status < 0) && (status != -EAGAIN)) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto leave;
	}

	status = ocfs2_journal_dirty(handle, bh);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	/* figure out how much of the request is still outstanding;
	 * ip_lock protects ip_clusters */
	spin_lock(&OCFS2_I(inode)->ip_lock);
	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
	spin_unlock(&OCFS2_I(inode)->ip_lock);
	/* Release unused quota reservation */
	vfs_dq_free_space(inode,
			  ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
	did_quota = 0;

	if (why != RESTART_NONE && clusters_to_add) {
		if (why == RESTART_META) {
			/* need fresh metadata reservations - commit and
			 * redo everything from restart_all */
			mlog(0, "restarting function.\n");
			restart_func = 1;
		} else {
			BUG_ON(why != RESTART_TRANS);

			/* just ran out of journal credits - extend the
			 * handle and keep allocating */
			mlog(0, "restarting transaction.\n");
			/* TODO: This can be more intelligent. */
			credits = ocfs2_calc_extend_credits(osb->sb,
							    &fe->id2.i_list,
							    clusters_to_add);
			status = ocfs2_extend_trans(handle, credits);
			if (status < 0) {
				/* handle still has to be committed at
				 * this point. */
				status = -ENOMEM;
				mlog_errno(status);
				goto leave;
			}
			goto restarted_transaction;
		}
	}

	mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
	     le32_to_cpu(fe->i_clusters),
	     (unsigned long long)le64_to_cpu(fe->i_size));
	mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
	     OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));

leave:
	/* error exit with a live quota reservation - give it back */
	if (status < 0 && did_quota)
		vfs_dq_free_space(inode,
			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
	if (handle) {
		ocfs2_commit_trans(osb, handle);
		handle = NULL;
	}
	if (data_ac) {
		ocfs2_free_alloc_context(data_ac);
		data_ac = NULL;
	}
	if (meta_ac) {
		ocfs2_free_alloc_context(meta_ac);
		meta_ac = NULL;
	}
	/* a RESTART_META pass loops back with everything torn down above */
	if ((!status) && restart_func) {
		restart_func = 0;
		goto restart_all;
	}
	brelse(bh);
	bh = NULL;

	mlog_exit(status);
	return status;
}
688 688
/* Some parts of this taken from generic_cont_expand, which turned out
 * to be too fragile to do exactly what we need without us having to
 * worry about recursive locking in ->write_begin() and ->write_end(). */
/*
 * Zero the (partial) block containing byte offset @size by running a
 * zero-length prepare/commit write cycle against the page cache page
 * that covers it.  Used by ocfs2_zero_extend() to zero the tail when
 * extending a file.  Does not touch i_size.
 */
static int ocfs2_write_zero_page(struct inode *inode,
				 u64 size)
{
	struct address_space *mapping = inode->i_mapping;
	struct page *page;
	unsigned long index;
	unsigned int offset;
	handle_t *handle = NULL;
	int ret;

	offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
	/* ugh.  in prepare/commit_write, if from==to==start of block, we
	** skip the prepare.  make sure we never send an offset for the start
	** of a block
	*/
	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
		offset++;
	}
	index = size >> PAGE_CACHE_SHIFT;

	/* returns the page locked */
	page = grab_cache_page(mapping, index);
	if (!page) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/* zero-length range: this only maps/zeroes the block(s) backing
	 * @offset, no data is actually copied in */
	ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
	if (ret < 0) {
		mlog_errno(ret);
		goto out_unlock;
	}

	/* in ordered mode the buffer must be added to a transaction so
	 * the zeroed block hits disk before the size update does */
	if (ocfs2_should_order_data(inode)) {
		handle = ocfs2_start_walk_page_trans(inode, page, offset,
						     offset);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			handle = NULL;
			goto out_unlock;
		}
	}

	/* must not update i_size! */
	ret = block_commit_write(page, offset, offset);
	if (ret < 0)
		mlog_errno(ret);
	else
		ret = 0;

	if (handle)
		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out_unlock:
	unlock_page(page);
	page_cache_release(page);
out:
	return ret;
}
750 750
751 static int ocfs2_zero_extend(struct inode *inode, 751 static int ocfs2_zero_extend(struct inode *inode,
752 u64 zero_to_size) 752 u64 zero_to_size)
753 { 753 {
754 int ret = 0; 754 int ret = 0;
755 u64 start_off; 755 u64 start_off;
756 struct super_block *sb = inode->i_sb; 756 struct super_block *sb = inode->i_sb;
757 757
758 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); 758 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
759 while (start_off < zero_to_size) { 759 while (start_off < zero_to_size) {
760 ret = ocfs2_write_zero_page(inode, start_off); 760 ret = ocfs2_write_zero_page(inode, start_off);
761 if (ret < 0) { 761 if (ret < 0) {
762 mlog_errno(ret); 762 mlog_errno(ret);
763 goto out; 763 goto out;
764 } 764 }
765 765
766 start_off += sb->s_blocksize; 766 start_off += sb->s_blocksize;
767 767
768 /* 768 /*
769 * Very large extends have the potential to lock up 769 * Very large extends have the potential to lock up
770 * the cpu for extended periods of time. 770 * the cpu for extended periods of time.
771 */ 771 */
772 cond_resched(); 772 cond_resched();
773 } 773 }
774 774
775 out: 775 out:
776 return ret; 776 return ret;
777 } 777 }
778 778
779 int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to) 779 int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
780 { 780 {
781 int ret; 781 int ret;
782 u32 clusters_to_add; 782 u32 clusters_to_add;
783 struct ocfs2_inode_info *oi = OCFS2_I(inode); 783 struct ocfs2_inode_info *oi = OCFS2_I(inode);
784 784
785 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); 785 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
786 if (clusters_to_add < oi->ip_clusters) 786 if (clusters_to_add < oi->ip_clusters)
787 clusters_to_add = 0; 787 clusters_to_add = 0;
788 else 788 else
789 clusters_to_add -= oi->ip_clusters; 789 clusters_to_add -= oi->ip_clusters;
790 790
791 if (clusters_to_add) { 791 if (clusters_to_add) {
792 ret = __ocfs2_extend_allocation(inode, oi->ip_clusters, 792 ret = __ocfs2_extend_allocation(inode, oi->ip_clusters,
793 clusters_to_add, 0); 793 clusters_to_add, 0);
794 if (ret) { 794 if (ret) {
795 mlog_errno(ret); 795 mlog_errno(ret);
796 goto out; 796 goto out;
797 } 797 }
798 } 798 }
799 799
800 /* 800 /*
801 * Call this even if we don't add any clusters to the tree. We 801 * Call this even if we don't add any clusters to the tree. We
802 * still need to zero the area between the old i_size and the 802 * still need to zero the area between the old i_size and the
803 * new i_size. 803 * new i_size.
804 */ 804 */
805 ret = ocfs2_zero_extend(inode, zero_to); 805 ret = ocfs2_zero_extend(inode, zero_to);
806 if (ret < 0) 806 if (ret < 0)
807 mlog_errno(ret); 807 mlog_errno(ret);
808 808
809 out: 809 out:
810 return ret; 810 return ret;
811 } 811 }
812 812
/*
 * Grow @inode to @new_i_size.  @di_bh is the inode's dinode buffer.
 * For inline-data inodes the data may stay inline (small extend) or be
 * converted to extents first; for non-sparse volumes the new range is
 * allocated and zeroed.  In all cases i_size is updated at the end via
 * ocfs2_simple_size_update().
 *
 * Caller must hold i_mutex (see the race comment below) and must not
 * pass a size smaller than the current one (BUG).
 */
static int ocfs2_extend_file(struct inode *inode,
			     struct buffer_head *di_bh,
			     u64 new_i_size)
{
	int ret = 0;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	BUG_ON(!di_bh);

	/* setattr sometimes calls us like this. */
	if (new_i_size == 0)
		goto out;

	if (i_size_read(inode) == new_i_size)
		goto out;
	BUG_ON(new_i_size < i_size_read(inode));

	/*
	 * Fall through for converting inline data, even if the fs
	 * supports sparse files.
	 *
	 * The check for inline data here is legal - nobody can add
	 * the feature since we have i_mutex. We must check it again
	 * after acquiring ip_alloc_sem though, as paths like mmap
	 * might have raced us to converting the inode to extents.
	 */
	if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
	    && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
		goto out_update_size;

	/*
	 * The alloc sem blocks people in read/write from reading our
	 * allocation until we're done changing it. We depend on
	 * i_mutex to block other extend/truncate calls while we're
	 * here.
	 */
	down_write(&oi->ip_alloc_sem);

	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		/*
		 * We can optimize small extends by keeping the inodes
		 * inline data.
		 */
		if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
			up_write(&oi->ip_alloc_sem);
			goto out_update_size;
		}

		ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
		if (ret) {
			up_write(&oi->ip_alloc_sem);

			mlog_errno(ret);
			goto out;
		}
	}

	/* non-sparse volumes must back and zero the whole new range now */
	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
		ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size);

	up_write(&oi->ip_alloc_sem);

	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

out_update_size:
	ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
	if (ret < 0)
		mlog_errno(ret);

out:
	return ret;
}
888 888
/*
 * ->setattr for ocfs2 inodes.  Handles size changes (extend/truncate
 * under the rw and inode cluster locks), uid/gid changes (with quota
 * transfer under the global quota file locks), and the remaining
 * attribute updates via inode_setattr().  The dinode is marked dirty
 * in a journal transaction before the locks are dropped.
 */
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
{
	int status = 0, size_change;
	struct inode *inode = dentry->d_inode;
	struct super_block *sb = inode->i_sb;
	struct ocfs2_super *osb = OCFS2_SB(sb);
	struct buffer_head *bh = NULL;
	handle_t *handle = NULL;
	int locked[MAXQUOTAS] = {0, 0};	/* which quota files we locked */
	int credits, qtype;
	struct ocfs2_mem_dqinfo *oinfo;

	mlog_entry("(0x%p, '%.*s')\n", dentry,
	           dentry->d_name.len, dentry->d_name.name);

	/* ensuring we don't even attempt to truncate a symlink */
	if (S_ISLNK(inode->i_mode))
		attr->ia_valid &= ~ATTR_SIZE;

	if (attr->ia_valid & ATTR_MODE)
		mlog(0, "mode change: %d\n", attr->ia_mode);
	if (attr->ia_valid & ATTR_UID)
		mlog(0, "uid change: %d\n", attr->ia_uid);
	if (attr->ia_valid & ATTR_GID)
		mlog(0, "gid change: %d\n", attr->ia_gid);
	if (attr->ia_valid & ATTR_SIZE)
		mlog(0, "size change...\n");
	if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
		mlog(0, "time change...\n");

#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
			   | ATTR_GID | ATTR_UID | ATTR_MODE)
	if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
		/* nothing we handle is being changed - silently succeed */
		mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
		return 0;
	}

	status = inode_change_ok(inode, attr);
	if (status)
		return status;

	size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
	if (size_change) {
		/* exclusive rw lock keeps cluster-wide readers/writers out
		 * while the allocation changes */
		status = ocfs2_rw_lock(inode, 1);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* exclusive inode (metadata) cluster lock; also reads the dinode
	 * block into bh */
	status = ocfs2_inode_lock(inode, &bh, 1);
	if (status < 0) {
		if (status != -ENOENT)
			mlog_errno(status);
		goto bail_unlock_rw;
	}

	if (size_change && attr->ia_size != i_size_read(inode)) {
		if (attr->ia_size > sb->s_maxbytes) {
			status = -EFBIG;
			goto bail_unlock;
		}

		if (i_size_read(inode) > attr->ia_size) {
			if (ocfs2_should_order_data(inode)) {
				status = ocfs2_begin_ordered_truncate(inode,
								      attr->ia_size);
				if (status)
					goto bail_unlock;
			}
			status = ocfs2_truncate_file(inode, bh, attr->ia_size);
		} else
			status = ocfs2_extend_file(inode, bh, attr->ia_size);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			/* NOTE(review): this collapses every extend/truncate
			 * failure to -ENOSPC, hiding e.g. -EDQUOT or -EIO
			 * from the caller - confirm whether that is intended */
			status = -ENOSPC;
			goto bail_unlock;
		}
	}

	if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
	    (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
		/* owner change with quota enabled: take the global quota
		 * file locks and budget journal credits for the transfer */
		credits = OCFS2_INODE_UPDATE_CREDITS;
		if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
		    OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
			oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv;
			status = ocfs2_lock_global_qf(oinfo, 1);
			if (status < 0)
				goto bail_unlock;
			credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) +
				ocfs2_calc_qdel_credits(sb, USRQUOTA);
			locked[USRQUOTA] = 1;
		}
		if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
		    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
			oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv;
			status = ocfs2_lock_global_qf(oinfo, 1);
			if (status < 0)
				goto bail_unlock;
			credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) +
				   ocfs2_calc_qdel_credits(sb, GRPQUOTA);
			locked[GRPQUOTA] = 1;
		}
		handle = ocfs2_start_trans(osb, credits);
		if (IS_ERR(handle)) {
			status = PTR_ERR(handle);
			mlog_errno(status);
			goto bail_unlock;
		}
		status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
		if (status < 0)
			goto bail_commit;
	} else {
		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
		if (IS_ERR(handle)) {
			status = PTR_ERR(handle);
			mlog_errno(status);
			goto bail_unlock;
		}
	}

	/*
	 * This will intentionally not wind up calling vmtruncate(),
	 * since all the work for a size change has been done above.
	 * Otherwise, we could get into problems with truncate as
	 * ip_alloc_sem is used there to protect against i_size
	 * changes.
	 */
	status = inode_setattr(inode, attr);
	if (status < 0) {
		mlog_errno(status);
		goto bail_commit;
	}

	status = ocfs2_mark_inode_dirty(handle, inode, bh);
	if (status < 0)
		mlog_errno(status);

bail_commit:
	ocfs2_commit_trans(osb, handle);
bail_unlock:
	/* drop any quota file locks taken above */
	for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
		if (!locked[qtype])
			continue;
		oinfo = sb_dqinfo(sb, qtype)->dqi_priv;
		ocfs2_unlock_global_qf(oinfo, 1);
	}
	ocfs2_inode_unlock(inode, 1);
bail_unlock_rw:
	if (size_change)
		ocfs2_rw_unlock(inode, 1);
bail:
	brelse(bh);

	/* mode change: refresh the ACL outside the cluster locks */
	if (!status && attr->ia_valid & ATTR_MODE) {
		status = ocfs2_acl_chmod(inode);
		if (status < 0)
			mlog_errno(status);
	}

	mlog_exit(status);
	return status;
}
1055 1055
1056 int ocfs2_getattr(struct vfsmount *mnt, 1056 int ocfs2_getattr(struct vfsmount *mnt,
1057 struct dentry *dentry, 1057 struct dentry *dentry,
1058 struct kstat *stat) 1058 struct kstat *stat)
1059 { 1059 {
1060 struct inode *inode = dentry->d_inode; 1060 struct inode *inode = dentry->d_inode;
1061 struct super_block *sb = dentry->d_inode->i_sb; 1061 struct super_block *sb = dentry->d_inode->i_sb;
1062 struct ocfs2_super *osb = sb->s_fs_info; 1062 struct ocfs2_super *osb = sb->s_fs_info;
1063 int err; 1063 int err;
1064 1064
1065 mlog_entry_void(); 1065 mlog_entry_void();
1066 1066
1067 err = ocfs2_inode_revalidate(dentry); 1067 err = ocfs2_inode_revalidate(dentry);
1068 if (err) { 1068 if (err) {
1069 if (err != -ENOENT) 1069 if (err != -ENOENT)
1070 mlog_errno(err); 1070 mlog_errno(err);
1071 goto bail; 1071 goto bail;
1072 } 1072 }
1073 1073
1074 generic_fillattr(inode, stat); 1074 generic_fillattr(inode, stat);
1075 1075
1076 /* We set the blksize from the cluster size for performance */ 1076 /* We set the blksize from the cluster size for performance */
1077 stat->blksize = osb->s_clustersize; 1077 stat->blksize = osb->s_clustersize;
1078 1078
1079 bail: 1079 bail:
1080 mlog_exit(err); 1080 mlog_exit(err);
1081 1081
1082 return err; 1082 return err;
1083 } 1083 }
1084 1084
1085 int ocfs2_permission(struct inode *inode, int mask) 1085 int ocfs2_permission(struct inode *inode, int mask)
1086 { 1086 {
1087 int ret; 1087 int ret;
1088 1088
1089 mlog_entry_void(); 1089 mlog_entry_void();
1090 1090
1091 ret = ocfs2_inode_lock(inode, NULL, 0); 1091 ret = ocfs2_inode_lock(inode, NULL, 0);
1092 if (ret) { 1092 if (ret) {
1093 if (ret != -ENOENT) 1093 if (ret != -ENOENT)
1094 mlog_errno(ret); 1094 mlog_errno(ret);
1095 goto out; 1095 goto out;
1096 } 1096 }
1097 1097
1098 ret = generic_permission(inode, mask, ocfs2_check_acl); 1098 ret = generic_permission(inode, mask, ocfs2_check_acl);
1099 1099
1100 ocfs2_inode_unlock(inode, 0); 1100 ocfs2_inode_unlock(inode, 0);
1101 out: 1101 out:
1102 mlog_exit(ret); 1102 mlog_exit(ret);
1103 return ret; 1103 return ret;
1104 } 1104 }
1105 1105
/*
 * Clear the setuid bit (and setgid, when group-execute is set) on
 * @inode and write the new mode into the dinode in @bh under a journal
 * transaction.  Caller holds the locks protecting the dinode buffer
 * (this is a static helper; see ocfs2_write_remove_suid()).
 */
static int __ocfs2_write_remove_suid(struct inode *inode,
				     struct buffer_head *bh)
{
	int ret;
	handle_t *handle;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_dinode *di;

	mlog_entry("(Inode %llu, mode 0%o)\n",
		   (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode);

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	/* declare the dinode write to the journal before modifying it */
	ret = ocfs2_journal_access_di(handle, inode, bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret < 0) {
		mlog_errno(ret);
		goto out_trans;
	}

	inode->i_mode &= ~S_ISUID;
	/* setgid without group-execute marks mandatory locking - keep it */
	if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
		inode->i_mode &= ~S_ISGID;

	/* mirror the in-memory mode into the on-disk inode */
	di = (struct ocfs2_dinode *) bh->b_data;
	di->i_mode = cpu_to_le16(inode->i_mode);

	ret = ocfs2_journal_dirty(handle, bh);
	if (ret < 0)
		mlog_errno(ret);

out_trans:
	ocfs2_commit_trans(osb, handle);
out:
	mlog_exit(ret);
	return ret;
}
1148 1148
1149 /* 1149 /*
1150 * Will look for holes and unwritten extents in the range starting at 1150 * Will look for holes and unwritten extents in the range starting at
1151 * pos for count bytes (inclusive). 1151 * pos for count bytes (inclusive).
1152 */ 1152 */
1153 static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, 1153 static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
1154 size_t count) 1154 size_t count)
1155 { 1155 {
1156 int ret = 0; 1156 int ret = 0;
1157 unsigned int extent_flags; 1157 unsigned int extent_flags;
1158 u32 cpos, clusters, extent_len, phys_cpos; 1158 u32 cpos, clusters, extent_len, phys_cpos;
1159 struct super_block *sb = inode->i_sb; 1159 struct super_block *sb = inode->i_sb;
1160 1160
1161 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; 1161 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
1162 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; 1162 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
1163 1163
1164 while (clusters) { 1164 while (clusters) {
1165 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, 1165 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
1166 &extent_flags); 1166 &extent_flags);
1167 if (ret < 0) { 1167 if (ret < 0) {
1168 mlog_errno(ret); 1168 mlog_errno(ret);
1169 goto out; 1169 goto out;
1170 } 1170 }
1171 1171
1172 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { 1172 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
1173 ret = 1; 1173 ret = 1;
1174 break; 1174 break;
1175 } 1175 }
1176 1176
1177 if (extent_len > clusters) 1177 if (extent_len > clusters)
1178 extent_len = clusters; 1178 extent_len = clusters;
1179 1179
1180 clusters -= extent_len; 1180 clusters -= extent_len;
1181 cpos += extent_len; 1181 cpos += extent_len;
1182 } 1182 }
1183 out: 1183 out:
1184 return ret; 1184 return ret;
1185 } 1185 }
1186 1186
1187 static int ocfs2_write_remove_suid(struct inode *inode) 1187 static int ocfs2_write_remove_suid(struct inode *inode)
1188 { 1188 {
1189 int ret; 1189 int ret;
1190 struct buffer_head *bh = NULL; 1190 struct buffer_head *bh = NULL;
1191 1191
1192 ret = ocfs2_read_inode_block(inode, &bh); 1192 ret = ocfs2_read_inode_block(inode, &bh);
1193 if (ret < 0) { 1193 if (ret < 0) {
1194 mlog_errno(ret); 1194 mlog_errno(ret);
1195 goto out; 1195 goto out;
1196 } 1196 }
1197 1197
1198 ret = __ocfs2_write_remove_suid(inode, bh); 1198 ret = __ocfs2_write_remove_suid(inode, bh);
1199 out: 1199 out:
1200 brelse(bh); 1200 brelse(bh);
1201 return ret; 1201 return ret;
1202 } 1202 }
1203 1203
/*
 * Allocate enough extents to cover the region starting at byte offset
 * start for len bytes. Existing extents are skipped, any extents
 * added are marked as "unwritten".
 *
 * Returns 0 on success (including the no-op case where everything was
 * already allocated) and a negative errno otherwise.  -ENOSPC is
 * returned without logging since the caller reports it to userspace.
 */
static int ocfs2_allocate_unwritten_extents(struct inode *inode,
					    u64 start, u64 len)
{
	int ret;
	u32 cpos, phys_cpos, clusters, alloc_size;
	u64 end = start + len;
	struct buffer_head *di_bh = NULL;

	/*
	 * Inline-data inodes have no extent list.  Either the request
	 * still fits inline (nothing to do) or the inode must first be
	 * converted to extent-based storage.
	 */
	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		ret = ocfs2_read_inode_block(inode, &di_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * Nothing to do if the requested reservation range
		 * fits within the inode.
		 */
		if (ocfs2_size_fits_inline_data(di_bh, end))
			goto out;

		ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	/*
	 * We consider both start and len to be inclusive.
	 */
	cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
	clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
	clusters -= cpos;

	/* Walk the range one extent/hole at a time, filling holes. */
	while (clusters) {
		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
					 &alloc_size, NULL);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * Hole or existing extent len can be arbitrary, so
		 * cap it to our own allocation request.
		 */
		if (alloc_size > clusters)
			alloc_size = clusters;

		if (phys_cpos) {
			/*
			 * We already have an allocation at this
			 * region so we can safely skip it.
			 */
			goto next;
		}

		/* Hole: allocate alloc_size clusters, marked unwritten. */
		ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
		if (ret) {
			if (ret != -ENOSPC)
				mlog_errno(ret);
			goto out;
		}

next:
		cpos += alloc_size;
		clusters -= alloc_size;
	}

	ret = 0;
out:

	brelse(di_bh);
	return ret;
}
1286 1286
1287 /* 1287 /*
1288 * Truncate a byte range, avoiding pages within partial clusters. This 1288 * Truncate a byte range, avoiding pages within partial clusters. This
1289 * preserves those pages for the zeroing code to write to. 1289 * preserves those pages for the zeroing code to write to.
1290 */ 1290 */
1291 static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start, 1291 static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
1292 u64 byte_len) 1292 u64 byte_len)
1293 { 1293 {
1294 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1294 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1295 loff_t start, end; 1295 loff_t start, end;
1296 struct address_space *mapping = inode->i_mapping; 1296 struct address_space *mapping = inode->i_mapping;
1297 1297
1298 start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start); 1298 start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
1299 end = byte_start + byte_len; 1299 end = byte_start + byte_len;
1300 end = end & ~(osb->s_clustersize - 1); 1300 end = end & ~(osb->s_clustersize - 1);
1301 1301
1302 if (start < end) { 1302 if (start < end) {
1303 unmap_mapping_range(mapping, start, end - start, 0); 1303 unmap_mapping_range(mapping, start, end - start, 0);
1304 truncate_inode_pages_range(mapping, start, end - 1); 1304 truncate_inode_pages_range(mapping, start, end - 1);
1305 } 1305 }
1306 } 1306 }
1307 1307
1308 static int ocfs2_zero_partial_clusters(struct inode *inode, 1308 static int ocfs2_zero_partial_clusters(struct inode *inode,
1309 u64 start, u64 len) 1309 u64 start, u64 len)
1310 { 1310 {
1311 int ret = 0; 1311 int ret = 0;
1312 u64 tmpend, end = start + len; 1312 u64 tmpend, end = start + len;
1313 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1313 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1314 unsigned int csize = osb->s_clustersize; 1314 unsigned int csize = osb->s_clustersize;
1315 handle_t *handle; 1315 handle_t *handle;
1316 1316
1317 /* 1317 /*
1318 * The "start" and "end" values are NOT necessarily part of 1318 * The "start" and "end" values are NOT necessarily part of
1319 * the range whose allocation is being deleted. Rather, this 1319 * the range whose allocation is being deleted. Rather, this
1320 * is what the user passed in with the request. We must zero 1320 * is what the user passed in with the request. We must zero
1321 * partial clusters here. There's no need to worry about 1321 * partial clusters here. There's no need to worry about
1322 * physical allocation - the zeroing code knows to skip holes. 1322 * physical allocation - the zeroing code knows to skip holes.
1323 */ 1323 */
1324 mlog(0, "byte start: %llu, end: %llu\n", 1324 mlog(0, "byte start: %llu, end: %llu\n",
1325 (unsigned long long)start, (unsigned long long)end); 1325 (unsigned long long)start, (unsigned long long)end);
1326 1326
1327 /* 1327 /*
1328 * If both edges are on a cluster boundary then there's no 1328 * If both edges are on a cluster boundary then there's no
1329 * zeroing required as the region is part of the allocation to 1329 * zeroing required as the region is part of the allocation to
1330 * be truncated. 1330 * be truncated.
1331 */ 1331 */
1332 if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0) 1332 if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
1333 goto out; 1333 goto out;
1334 1334
1335 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1335 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1336 if (IS_ERR(handle)) { 1336 if (IS_ERR(handle)) {
1337 ret = PTR_ERR(handle); 1337 ret = PTR_ERR(handle);
1338 mlog_errno(ret); 1338 mlog_errno(ret);
1339 goto out; 1339 goto out;
1340 } 1340 }
1341 1341
1342 /* 1342 /*
1343 * We want to get the byte offset of the end of the 1st cluster. 1343 * We want to get the byte offset of the end of the 1st cluster.
1344 */ 1344 */
1345 tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1)); 1345 tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
1346 if (tmpend > end) 1346 if (tmpend > end)
1347 tmpend = end; 1347 tmpend = end;
1348 1348
1349 mlog(0, "1st range: start: %llu, tmpend: %llu\n", 1349 mlog(0, "1st range: start: %llu, tmpend: %llu\n",
1350 (unsigned long long)start, (unsigned long long)tmpend); 1350 (unsigned long long)start, (unsigned long long)tmpend);
1351 1351
1352 ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); 1352 ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
1353 if (ret) 1353 if (ret)
1354 mlog_errno(ret); 1354 mlog_errno(ret);
1355 1355
1356 if (tmpend < end) { 1356 if (tmpend < end) {
1357 /* 1357 /*
1358 * This may make start and end equal, but the zeroing 1358 * This may make start and end equal, but the zeroing
1359 * code will skip any work in that case so there's no 1359 * code will skip any work in that case so there's no
1360 * need to catch it up here. 1360 * need to catch it up here.
1361 */ 1361 */
1362 start = end & ~(osb->s_clustersize - 1); 1362 start = end & ~(osb->s_clustersize - 1);
1363 1363
1364 mlog(0, "2nd range: start: %llu, end: %llu\n", 1364 mlog(0, "2nd range: start: %llu, end: %llu\n",
1365 (unsigned long long)start, (unsigned long long)end); 1365 (unsigned long long)start, (unsigned long long)end);
1366 1366
1367 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end); 1367 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
1368 if (ret) 1368 if (ret)
1369 mlog_errno(ret); 1369 mlog_errno(ret);
1370 } 1370 }
1371 1371
1372 ocfs2_commit_trans(osb, handle); 1372 ocfs2_commit_trans(osb, handle);
1373 out: 1373 out:
1374 return ret; 1374 return ret;
1375 } 1375 }
1376 1376
/*
 * Punch a hole covering [byte_start, byte_start + byte_len) of @inode.
 *
 * Partial clusters at either edge are only zeroed (via
 * ocfs2_zero_partial_clusters()); allocation is removed solely for the
 * whole clusters contained in the range, which is why trunc_start is
 * rounded up and the range end rounded down below.  Called from
 * __ocfs2_change_file_space() with i_mutex, the rw/inode cluster locks
 * and ip_alloc_sem already held.
 *
 * Returns 0 on success or a negative errno.
 */
static int ocfs2_remove_inode_range(struct inode *inode,
				    struct buffer_head *di_bh, u64 byte_start,
				    u64 byte_len)
{
	int ret = 0;
	u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_cached_dealloc_ctxt dealloc;
	struct address_space *mapping = inode->i_mapping;
	struct ocfs2_extent_tree et;

	ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
	ocfs2_init_dealloc_ctxt(&dealloc);

	if (byte_len == 0)
		return 0;

	/* Inline data: just shrink the in-dinode payload. */
	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
					    byte_start + byte_len, 0);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
		/*
		 * There's no need to get fancy with the page cache
		 * truncate of an inline-data inode. We're talking
		 * about less than a page here, which will be cached
		 * in the dinode buffer anyway.
		 */
		unmap_mapping_range(mapping, 0, 0, 0);
		truncate_inode_pages(mapping, 0);
		goto out;
	}

	/*
	 * trunc_start: first whole cluster inside the range (round up);
	 * trunc_len: number of whole clusters up to the rounded-down end.
	 */
	trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
	trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits;
	if (trunc_len >= trunc_start)
		trunc_len -= trunc_start;
	else
		trunc_len = 0;

	mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     (unsigned long long)byte_start,
	     (unsigned long long)byte_len, trunc_start, trunc_len);

	/* Zero the partial clusters at both edges first. */
	ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Remove the btree ranges for all fully-covered, allocated extents. */
	cpos = trunc_start;
	while (trunc_len) {
		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
					 &alloc_size, NULL);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		if (alloc_size > trunc_len)
			alloc_size = trunc_len;

		/* Only do work for non-holes */
		if (phys_cpos != 0) {
			ret = ocfs2_remove_btree_range(inode, &et, cpos,
						       phys_cpos, alloc_size,
						       &dealloc);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}

		cpos += alloc_size;
		trunc_len -= alloc_size;
	}

	/* Drop page-cache pages for the whole clusters we removed. */
	ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);

out:
	/* Flush/free even on error: dealloc may hold partial progress. */
	ocfs2_schedule_truncate_log_flush(osb, 1);
	ocfs2_run_deallocs(osb, &dealloc);

	return ret;
}
1465 1465
/*
 * Parts of this function taken from xfs_change_file_space()
 *
 * Common implementation behind the RESVSP/UNRESVSP ioctls and
 * ocfs2_fallocate().  @file may be NULL (fallocate path).  @sr is
 * normalized in place to an absolute range (l_whence becomes 0).
 * @change_size selects whether i_size is extended to cover the
 * reserved range.
 *
 * Lock ordering established here:
 *   i_mutex -> rw cluster lock -> inode meta cluster lock
 *           -> ip_alloc_sem
 *
 * Returns 0 on success or a negative errno.
 */
static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
				     loff_t f_pos, unsigned int cmd,
				     struct ocfs2_space_resv *sr,
				     int change_size)
{
	int ret;
	s64 llen;
	loff_t size;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct buffer_head *di_bh = NULL;
	handle_t *handle;
	unsigned long long max_off = inode->i_sb->s_maxbytes;

	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
		return -EROFS;

	mutex_lock(&inode->i_mutex);

	/*
	 * This prevents concurrent writes on other nodes
	 */
	ret = ocfs2_rw_lock(inode, 1);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Exclusive (write-level) meta lock; also reads the dinode. */
	ret = ocfs2_inode_lock(inode, &di_bh, 1);
	if (ret) {
		mlog_errno(ret);
		goto out_rw_unlock;
	}

	if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
		ret = -EPERM;
		goto out_inode_unlock;
	}

	/* Translate l_start to an absolute byte offset. */
	switch (sr->l_whence) {
	case 0: /*SEEK_SET*/
		break;
	case 1: /*SEEK_CUR*/
		sr->l_start += f_pos;
		break;
	case 2: /*SEEK_END*/
		sr->l_start += i_size_read(inode);
		break;
	default:
		ret = -EINVAL;
		goto out_inode_unlock;
	}
	sr->l_whence = 0;

	/* llen is the offset of the last byte, so the range checks below
	 * stay inclusive; a non-positive l_len is passed through as-is. */
	llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;

	if (sr->l_start < 0
	    || sr->l_start > max_off
	    || (sr->l_start + llen) < 0
	    || (sr->l_start + llen) > max_off) {
		ret = -EINVAL;
		goto out_inode_unlock;
	}
	size = sr->l_start + sr->l_len;

	if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
		if (sr->l_len <= 0) {
			ret = -EINVAL;
			goto out_inode_unlock;
		}
	}

	/* Clear suid/sgid here, while we already hold the cluster locks. */
	if (file && should_remove_suid(file->f_path.dentry)) {
		ret = __ocfs2_write_remove_suid(inode, di_bh);
		if (ret) {
			mlog_errno(ret);
			goto out_inode_unlock;
		}
	}

	down_write(&OCFS2_I(inode)->ip_alloc_sem);
	switch (cmd) {
	case OCFS2_IOC_RESVSP:
	case OCFS2_IOC_RESVSP64:
		/*
		 * This takes unsigned offsets, but the signed ones we
		 * pass have been checked against overflow above.
		 */
		ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
						       sr->l_len);
		break;
	case OCFS2_IOC_UNRESVSP:
	case OCFS2_IOC_UNRESVSP64:
		ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
					       sr->l_len);
		break;
	default:
		ret = -EINVAL;
	}
	up_write(&OCFS2_I(inode)->ip_alloc_sem);
	if (ret) {
		mlog_errno(ret);
		goto out_inode_unlock;
	}

	/*
	 * We update c/mtime for these changes
	 */
	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out_inode_unlock;
	}

	if (change_size && i_size_read(inode) < size)
		i_size_write(inode, size);

	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
	ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
	if (ret < 0)
		mlog_errno(ret);

	ocfs2_commit_trans(osb, handle);

out_inode_unlock:
	brelse(di_bh);
	ocfs2_inode_unlock(inode, 1);
out_rw_unlock:
	ocfs2_rw_unlock(inode, 1);

out:
	mutex_unlock(&inode->i_mutex);
	return ret;
}
1603 1603
1604 int ocfs2_change_file_space(struct file *file, unsigned int cmd, 1604 int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1605 struct ocfs2_space_resv *sr) 1605 struct ocfs2_space_resv *sr)
1606 { 1606 {
1607 struct inode *inode = file->f_path.dentry->d_inode; 1607 struct inode *inode = file->f_path.dentry->d_inode;
1608 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1608 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1609 1609
1610 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && 1610 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
1611 !ocfs2_writes_unwritten_extents(osb)) 1611 !ocfs2_writes_unwritten_extents(osb))
1612 return -ENOTTY; 1612 return -ENOTTY;
1613 else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) && 1613 else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
1614 !ocfs2_sparse_alloc(osb)) 1614 !ocfs2_sparse_alloc(osb))
1615 return -ENOTTY; 1615 return -ENOTTY;
1616 1616
1617 if (!S_ISREG(inode->i_mode)) 1617 if (!S_ISREG(inode->i_mode))
1618 return -EINVAL; 1618 return -EINVAL;
1619 1619
1620 if (!(file->f_mode & FMODE_WRITE)) 1620 if (!(file->f_mode & FMODE_WRITE))
1621 return -EBADF; 1621 return -EBADF;
1622 1622
1623 return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); 1623 return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
1624 } 1624 }
1625 1625
1626 static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset, 1626 static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset,
1627 loff_t len) 1627 loff_t len)
1628 { 1628 {
1629 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1629 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1630 struct ocfs2_space_resv sr; 1630 struct ocfs2_space_resv sr;
1631 int change_size = 1; 1631 int change_size = 1;
1632 1632
1633 if (!ocfs2_writes_unwritten_extents(osb)) 1633 if (!ocfs2_writes_unwritten_extents(osb))
1634 return -EOPNOTSUPP; 1634 return -EOPNOTSUPP;
1635 1635
1636 if (S_ISDIR(inode->i_mode)) 1636 if (S_ISDIR(inode->i_mode))
1637 return -ENODEV; 1637 return -ENODEV;
1638 1638
1639 if (mode & FALLOC_FL_KEEP_SIZE) 1639 if (mode & FALLOC_FL_KEEP_SIZE)
1640 change_size = 0; 1640 change_size = 0;
1641 1641
1642 sr.l_whence = 0; 1642 sr.l_whence = 0;
1643 sr.l_start = (s64)offset; 1643 sr.l_start = (s64)offset;
1644 sr.l_len = (s64)len; 1644 sr.l_len = (s64)len;
1645 1645
1646 return __ocfs2_change_file_space(NULL, inode, offset, 1646 return __ocfs2_change_file_space(NULL, inode, offset,
1647 OCFS2_IOC_RESVSP64, &sr, change_size); 1647 OCFS2_IOC_RESVSP64, &sr, change_size);
1648 } 1648 }
1649 1649
/*
 * Prepare @dentry's inode for a write of @count bytes at *@ppos:
 * take (and release before returning) the inode meta cluster lock,
 * clear suid/sgid if required, resolve the final write offset for
 * O_APPEND, and decide whether an O_DIRECT write is allowed.
 *
 * *direct_io (may be NULL) is cleared when direct io cannot be used:
 * inline-data inodes, size-extending writes, or ranges containing
 * holes/unwritten extents.  When @appending, *ppos is updated to the
 * current i_size on success.
 *
 * Returns 0 on success or a negative errno.
 */
static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
					 loff_t *ppos,
					 size_t count,
					 int appending,
					 int *direct_io)
{
	int ret = 0, meta_level = 0;
	struct inode *inode = dentry->d_inode;
	loff_t saved_pos, end;

	/*
	 * We start with a read level meta lock and only jump to an ex
	 * if we need to make modifications here.
	 */
	for(;;) {
		ret = ocfs2_inode_lock(inode, NULL, meta_level);
		if (ret < 0) {
			/* -1 so out_unlock is skipped: nothing is held. */
			meta_level = -1;
			mlog_errno(ret);
			goto out;
		}

		/* Clear suid / sgid if necessary. We do this here
		 * instead of later in the write path because
		 * remove_suid() calls ->setattr without any hint that
		 * we may have already done our cluster locking. Since
		 * ocfs2_setattr() *must* take cluster locks to
		 * proceeed, this will lead us to recursively lock the
		 * inode. There's also the dinode i_size state which
		 * can be lost via setattr during extending writes (we
		 * set inode->i_size at the end of a write. */
		if (should_remove_suid(dentry)) {
			if (meta_level == 0) {
				/* Retry with an exclusive (1) meta lock. */
				ocfs2_inode_unlock(inode, meta_level);
				meta_level = 1;
				continue;
			}

			ret = ocfs2_write_remove_suid(inode);
			if (ret < 0) {
				mlog_errno(ret);
				goto out_unlock;
			}
		}

		/* work on a copy of ppos until we're sure that we won't have
		 * to recalculate it due to relocking. */
		if (appending) {
			saved_pos = i_size_read(inode);
			mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
		} else {
			saved_pos = *ppos;
		}

		end = saved_pos + count;

		/*
		 * Skip the O_DIRECT checks if we don't need
		 * them.
		 */
		if (!direct_io || !(*direct_io))
			break;

		/*
		 * There's no sane way to do direct writes to an inode
		 * with inline data.
		 */
		if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
			*direct_io = 0;
			break;
		}

		/*
		 * Allowing concurrent direct writes means
		 * i_size changes wouldn't be synchronized, so
		 * one node could wind up truncating another
		 * nodes writes.
		 */
		if (end > i_size_read(inode)) {
			*direct_io = 0;
			break;
		}

		/*
		 * We don't fill holes during direct io, so
		 * check for them here. If any are found, the
		 * caller will have to retake some cluster
		 * locks and initiate the io as buffered.
		 */
		ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
		if (ret == 1) {
			/* Holes found: fall back to buffered io, not an
			 * error. */
			*direct_io = 0;
			ret = 0;
		} else if (ret < 0)
			mlog_errno(ret);
		break;
	}

	if (appending)
		*ppos = saved_pos;

out_unlock:
	ocfs2_inode_unlock(inode, meta_level);

out:
	return ret;
}
1757 1757
/*
 * Write entry point (->aio_write) for ocfs2 regular files.
 *
 * Lock ordering matches ocfs2_setattr(): i_mutex -> i_alloc_sem (taken
 * for O_DIRECT only) -> cluster rw lock.  If a requested direct write
 * cannot be performed directly (it would extend the file or cross a
 * hole -- see ocfs2_prepare_inode_for_write()), the inner locks are
 * dropped and the whole attempt is retried as buffered I/O via the
 * "relock" label.
 *
 * Returns the number of bytes written, or a negative errno.
 */
static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
				    const struct iovec *iov,
				    unsigned long nr_segs,
				    loff_t pos)
{
	int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
	int can_do_direct;
	ssize_t written = 0;
	size_t ocount;		/* original count */
	size_t count;		/* after file limit checks */
	loff_t old_size, *ppos = &iocb->ki_pos;
	u32 old_clusters;
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_path.dentry->d_inode;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	mlog_entry("(0x%p, %u, '%.*s')\n", file,
		   (unsigned int)nr_segs,
		   file->f_path.dentry->d_name.len,
		   file->f_path.dentry->d_name.name);

	/* Zero-length writes succeed trivially, before any locking. */
	if (iocb->ki_left == 0)
		return 0;

	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);

	appending = file->f_flags & O_APPEND ? 1 : 0;
	direct_io = file->f_flags & O_DIRECT ? 1 : 0;

	mutex_lock(&inode->i_mutex);

relock:
	/* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
	if (direct_io) {
		down_read(&inode->i_alloc_sem);
		have_alloc_sem = 1;
	}

	/* concurrent O_DIRECT writes are allowed */
	rw_level = !direct_io;
	ret = ocfs2_rw_lock(inode, rw_level);
	if (ret < 0) {
		mlog_errno(ret);
		goto out_sems;
	}

	/*
	 * ocfs2_prepare_inode_for_write() clears can_do_direct when the
	 * direct write must fall back to buffered (file extension or a
	 * hole in the requested range).
	 */
	can_do_direct = direct_io;
	ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
					    iocb->ki_left, appending,
					    &can_do_direct);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * We can't complete the direct I/O as requested, fall back to
	 * buffered I/O.
	 */
	if (direct_io && !can_do_direct) {
		ocfs2_rw_unlock(inode, rw_level);
		up_read(&inode->i_alloc_sem);

		have_alloc_sem = 0;
		rw_level = -1;

		direct_io = 0;
		goto relock;
	}

	/*
	 * To later detect whether a journal commit for sync writes is
	 * necessary, we sample i_size, and cluster count here.
	 */
	old_size = i_size_read(inode);
	old_clusters = OCFS2_I(inode)->ip_clusters;

	/* communicate with ocfs2_dio_end_io */
	ocfs2_iocb_set_rw_locked(iocb, rw_level);

	if (direct_io) {
		ret = generic_segment_checks(iov, &nr_segs, &ocount,
					     VERIFY_READ);
		if (ret)
			goto out_dio;

		ret = generic_write_checks(file, ppos, &count,
					   S_ISBLK(inode->i_mode));
		if (ret)
			goto out_dio;

		written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
						    ppos, count, ocount);
		if (written < 0) {
			/*
			 * direct write may have instantiated a few
			 * blocks outside i_size. Trim these off again.
			 * Don't need i_size_read because we hold i_mutex.
			 */
			if (*ppos + count > inode->i_size)
				vmtruncate(inode, inode->i_size);
			ret = written;
			goto out_dio;
		}
	} else {
		/* i_mutex is already held, hence the _nolock variant */
		written = generic_file_aio_write_nolock(iocb, iov, nr_segs,
							*ppos);
	}

out_dio:
	/* buffered aio wouldn't have proper lock coverage today */
	BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));

	if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) {
		/*
		 * The generic write paths have handled getting data
		 * to disk, but since we don't make use of the dirty
		 * inode list, a manual journal commit is necessary
		 * here.
		 */
		if (old_size != i_size_read(inode) ||
		    old_clusters != OCFS2_I(inode)->ip_clusters) {
			ret = jbd2_journal_force_commit(osb->journal->j_journal);
			if (ret < 0)
				written = ret;
		}
	}

	/*
	 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
	 * function pointer which is called when o_direct io completes so that
	 * it can unlock our rw lock. (it's the clustered equivalent of
	 * i_alloc_sem; protects truncate from racing with pending ios).
	 * Unfortunately there are error cases which call end_io and others
	 * that don't. so we don't have to unlock the rw_lock if either an
	 * async dio is going to do it in the future or an end_io after an
	 * error has already done it.
	 */
	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
		rw_level = -1;
		have_alloc_sem = 0;
	}

out:
	if (rw_level != -1)
		ocfs2_rw_unlock(inode, rw_level);

out_sems:
	if (have_alloc_sem)
		up_read(&inode->i_alloc_sem);

	mutex_unlock(&inode->i_mutex);

	mlog_exit(ret);
	/* prefer reporting a partial write over a late error */
	return written ? written : ret;
}
1914 1914
/*
 * ->splice_write for ocfs2 files.
 *
 * Lock ordering is the fix for an ABBA deadlock that the old
 * inode_double_lock() scheme allowed: the OUTER lock is always the
 * target inode's i_mutex (annotated I_MUTEX_PARENT), and the INNER
 * lock -- which the generic splice code may drop and re-take while
 * waiting for pipe data -- is the pipe inode's i_mutex
 * (I_MUTEX_CHILD).  Target inodes and pipe inodes are disjoint sets,
 * so this ordering is always safe.
 *
 * Returns bytes spliced or a negative errno.
 */
static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
				       struct file *out,
				       loff_t *ppos,
				       size_t len,
				       unsigned int flags)
{
	int ret;
	struct inode *inode = out->f_path.dentry->d_inode;

	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
		   (unsigned int)len,
		   out->f_path.dentry->d_name.len,
		   out->f_path.dentry->d_name.name);

	/* outer lock: the splice target */
	mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);

	ret = ocfs2_rw_lock(inode, 1);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0,
					    NULL);
	if (ret < 0) {
		mlog_errno(ret);
		goto out_unlock;
	}

	/* inner lock: the pipe; only held around the actual transfer */
	if (pipe->inode)
		mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);
	ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags);
	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);

out_unlock:
	ocfs2_rw_unlock(inode, 1);
out:
	mutex_unlock(&inode->i_mutex);

	mlog_exit(ret);
	return ret;
}
1954 1958
1955 static ssize_t ocfs2_file_splice_read(struct file *in, 1959 static ssize_t ocfs2_file_splice_read(struct file *in,
1956 loff_t *ppos, 1960 loff_t *ppos,
1957 struct pipe_inode_info *pipe, 1961 struct pipe_inode_info *pipe,
1958 size_t len, 1962 size_t len,
1959 unsigned int flags) 1963 unsigned int flags)
1960 { 1964 {
1961 int ret = 0; 1965 int ret = 0;
1962 struct inode *inode = in->f_path.dentry->d_inode; 1966 struct inode *inode = in->f_path.dentry->d_inode;
1963 1967
1964 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe, 1968 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe,
1965 (unsigned int)len, 1969 (unsigned int)len,
1966 in->f_path.dentry->d_name.len, 1970 in->f_path.dentry->d_name.len,
1967 in->f_path.dentry->d_name.name); 1971 in->f_path.dentry->d_name.name);
1968 1972
1969 /* 1973 /*
1970 * See the comment in ocfs2_file_aio_read() 1974 * See the comment in ocfs2_file_aio_read()
1971 */ 1975 */
1972 ret = ocfs2_inode_lock(inode, NULL, 0); 1976 ret = ocfs2_inode_lock(inode, NULL, 0);
1973 if (ret < 0) { 1977 if (ret < 0) {
1974 mlog_errno(ret); 1978 mlog_errno(ret);
1975 goto bail; 1979 goto bail;
1976 } 1980 }
1977 ocfs2_inode_unlock(inode, 0); 1981 ocfs2_inode_unlock(inode, 0);
1978 1982
1979 ret = generic_file_splice_read(in, ppos, pipe, len, flags); 1983 ret = generic_file_splice_read(in, ppos, pipe, len, flags);
1980 1984
1981 bail: 1985 bail:
1982 mlog_exit(ret); 1986 mlog_exit(ret);
1983 return ret; 1987 return ret;
1984 } 1988 }
1985 1989
/*
 * Read entry point (->aio_read) for ocfs2 regular files.
 *
 * Buffered reads are protected by ->readpage(); only O_DIRECT reads
 * take i_alloc_sem and the cluster rw lock here, to keep pending I/O
 * from racing with truncate.  The cluster inode lock is cycled once so
 * that cached fields like i_size are current before the generic read
 * path checks them.
 *
 * Returns bytes read or a negative errno.
 */
static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
				   const struct iovec *iov,
				   unsigned long nr_segs,
				   loff_t pos)
{
	int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
	struct file *filp = iocb->ki_filp;
	struct inode *inode = filp->f_path.dentry->d_inode;

	mlog_entry("(0x%p, %u, '%.*s')\n", filp,
		   (unsigned int)nr_segs,
		   filp->f_path.dentry->d_name.len,
		   filp->f_path.dentry->d_name.name);

	if (!inode) {
		ret = -EINVAL;
		mlog_errno(ret);
		goto bail;
	}

	/*
	 * buffered reads protect themselves in ->readpage(). O_DIRECT reads
	 * need locks to protect pending reads from racing with truncate.
	 */
	if (filp->f_flags & O_DIRECT) {
		down_read(&inode->i_alloc_sem);
		have_alloc_sem = 1;

		ret = ocfs2_rw_lock(inode, 0);
		if (ret < 0) {
			mlog_errno(ret);
			goto bail;
		}
		rw_level = 0;
		/* communicate with ocfs2_dio_end_io */
		ocfs2_iocb_set_rw_locked(iocb, rw_level);
	}

	/*
	 * We're fine letting folks race truncates and extending
	 * writes with read across the cluster, just like they can
	 * locally. Hence no rw_lock during read.
	 *
	 * Take and drop the meta data lock to update inode fields
	 * like i_size. This allows the checks down below
	 * generic_file_aio_read() a chance of actually working.
	 */
	ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
	if (ret < 0) {
		mlog_errno(ret);
		goto bail;
	}
	ocfs2_inode_unlock(inode, lock_level);

	ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
	if (ret == -EINVAL)
		mlog(0, "generic_file_aio_read returned -EINVAL\n");

	/* buffered aio wouldn't have proper lock coverage today */
	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));

	/* see ocfs2_file_aio_write */
	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
		rw_level = -1;
		have_alloc_sem = 0;
	}

bail:
	if (have_alloc_sem)
		up_read(&inode->i_alloc_sem);
	if (rw_level != -1)
		ocfs2_rw_unlock(inode, rw_level);
	mlog_exit(ret);

	return ret;
}
2062 2066
/* Inode operations for ocfs2 regular files. */
const struct inode_operations ocfs2_file_iops = {
	.setattr	= ocfs2_setattr,
	.getattr	= ocfs2_getattr,
	.permission	= ocfs2_permission,
	.setxattr	= generic_setxattr,
	.getxattr	= generic_getxattr,
	.listxattr	= ocfs2_listxattr,
	.removexattr	= generic_removexattr,
	.fallocate	= ocfs2_fallocate,
	.fiemap		= ocfs2_fiemap,
};
2074 2078
/*
 * Inode operations for special files (devices, fifos, sockets): no
 * xattr, fallocate or fiemap support.
 */
const struct inode_operations ocfs2_special_file_iops = {
	.setattr	= ocfs2_setattr,
	.getattr	= ocfs2_getattr,
	.permission	= ocfs2_permission,
};
2080 2084
2081 /* 2085 /*
2082 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with 2086 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
2083 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! 2087 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
2084 */ 2088 */
/* File operations for regular files when the stack supports plocks. */
const struct file_operations ocfs2_fops = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
	.write		= do_sync_write,
	.mmap		= ocfs2_mmap,
	.fsync		= ocfs2_sync_file,
	.release	= ocfs2_file_release,
	.open		= ocfs2_file_open,
	.aio_read	= ocfs2_file_aio_read,
	.aio_write	= ocfs2_file_aio_write,
	.unlocked_ioctl	= ocfs2_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl   = ocfs2_compat_ioctl,
#endif
	.lock		= ocfs2_lock,
	.flock		= ocfs2_flock,
	.splice_read	= ocfs2_file_splice_read,
	.splice_write	= ocfs2_file_splice_write,
};
2104 2108
/* Directory file operations when the stack supports plocks. */
const struct file_operations ocfs2_dops = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
	.readdir	= ocfs2_readdir,
	.fsync		= ocfs2_sync_file,
	.release	= ocfs2_dir_release,
	.open		= ocfs2_dir_open,
	.unlocked_ioctl	= ocfs2_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl   = ocfs2_compat_ioctl,
#endif
	.lock		= ocfs2_lock,
	.flock		= ocfs2_flock,
};
2119 2123
2120 /* 2124 /*
2121 * POSIX-lockless variants of our file_operations. 2125 * POSIX-lockless variants of our file_operations.
2122 * 2126 *
2123 * These will be used if the underlying cluster stack does not support 2127 * These will be used if the underlying cluster stack does not support
2124 * posix file locking, if the user passes the "localflocks" mount 2128 * posix file locking, if the user passes the "localflocks" mount
2125 * option, or if we have a local-only fs. 2129 * option, or if we have a local-only fs.
2126 * 2130 *
2127 * ocfs2_flock is in here because all stacks handle UNIX file locks, 2131 * ocfs2_flock is in here because all stacks handle UNIX file locks,
2128 * so we still want it in the case of no stack support for 2132 * so we still want it in the case of no stack support for
2129 * plocks. Internally, it will do the right thing when asked to ignore 2133 * plocks. Internally, it will do the right thing when asked to ignore
2130 * the cluster. 2134 * the cluster.
2131 */ 2135 */
/* Same as ocfs2_fops, minus ->lock: POSIX-lockless regular files. */
const struct file_operations ocfs2_fops_no_plocks = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
	.write		= do_sync_write,
	.mmap		= ocfs2_mmap,
	.fsync		= ocfs2_sync_file,
	.release	= ocfs2_file_release,
	.open		= ocfs2_file_open,
	.aio_read	= ocfs2_file_aio_read,
	.aio_write	= ocfs2_file_aio_write,
	.unlocked_ioctl	= ocfs2_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl   = ocfs2_compat_ioctl,
#endif
	.flock		= ocfs2_flock,
	.splice_read	= ocfs2_file_splice_read,
	.splice_write	= ocfs2_file_splice_write,
};
2150 2154
/* Same as ocfs2_dops, minus ->lock: POSIX-lockless directories. */
const struct file_operations ocfs2_dops_no_plocks = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
	.readdir	= ocfs2_readdir,
	.fsync		= ocfs2_sync_file,
	.release	= ocfs2_dir_release,
	.open		= ocfs2_dir_open,
	.unlocked_ioctl	= ocfs2_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl   = ocfs2_compat_ioctl,
#endif
	.flock		= ocfs2_flock,
};
2164 2168
1 /* 1 /*
2 * "splice": joining two ropes together by interweaving their strands. 2 * "splice": joining two ropes together by interweaving their strands.
3 * 3 *
4 * This is the "extended pipe" functionality, where a pipe is used as 4 * This is the "extended pipe" functionality, where a pipe is used as
5 * an arbitrary in-memory buffer. Think of a pipe as a small kernel 5 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
6 * buffer that you can use to transfer data from one end to the other. 6 * buffer that you can use to transfer data from one end to the other.
7 * 7 *
8 * The traditional unix read/write is extended with a "splice()" operation 8 * The traditional unix read/write is extended with a "splice()" operation
9 * that transfers data buffers to or from a pipe buffer. 9 * that transfers data buffers to or from a pipe buffer.
10 * 10 *
11 * Named by Larry McVoy, original implementation from Linus, extended by 11 * Named by Larry McVoy, original implementation from Linus, extended by
12 * Jens to support splicing to files, network, direct splicing, etc and 12 * Jens to support splicing to files, network, direct splicing, etc and
13 * fixing lots of bugs. 13 * fixing lots of bugs.
14 * 14 *
15 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk> 15 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
16 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org> 16 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
17 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> 17 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
18 * 18 *
19 */ 19 */
20 #include <linux/fs.h> 20 #include <linux/fs.h>
21 #include <linux/file.h> 21 #include <linux/file.h>
22 #include <linux/pagemap.h> 22 #include <linux/pagemap.h>
23 #include <linux/splice.h> 23 #include <linux/splice.h>
24 #include <linux/memcontrol.h> 24 #include <linux/memcontrol.h>
25 #include <linux/mm_inline.h> 25 #include <linux/mm_inline.h>
26 #include <linux/swap.h> 26 #include <linux/swap.h>
27 #include <linux/writeback.h> 27 #include <linux/writeback.h>
28 #include <linux/buffer_head.h> 28 #include <linux/buffer_head.h>
29 #include <linux/module.h> 29 #include <linux/module.h>
30 #include <linux/syscalls.h> 30 #include <linux/syscalls.h>
31 #include <linux/uio.h> 31 #include <linux/uio.h>
32 #include <linux/security.h> 32 #include <linux/security.h>
33 33
/*
 * Attempt to steal a page from a pipe buffer. This should perhaps go into
 * a vm helper function, it's already simplified quite a bit by the
 * addition of remove_mapping(). If success is returned, the caller may
 * attempt to reuse this page for another destination.
 *
 * Returns 0 on success (page removed from its mapping, locked, LRU
 * flag set on the buffer) or 1 on failure (page unlocked, unchanged).
 */
static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	struct address_space *mapping;

	lock_page(page);

	/* NULL mapping means the page raced with truncate/reclaim */
	mapping = page_mapping(page);
	if (mapping) {
		WARN_ON(!PageUptodate(page));

		/*
		 * At least for ext2 with nobh option, we need to wait on
		 * writeback completing on this page, since we'll remove it
		 * from the pagecache.  Otherwise truncate wont wait on the
		 * page, allowing the disk blocks to be reused by someone else
		 * before we actually wrote our data to them. fs corruption
		 * ensues.
		 */
		wait_on_page_writeback(page);

		/* can't steal while the fs still owns private state */
		if (page_has_private(page) &&
		    !try_to_release_page(page, GFP_KERNEL))
			goto out_unlock;

		/*
		 * If we succeeded in removing the mapping, set LRU flag
		 * and return good.
		 */
		if (remove_mapping(mapping, page)) {
			buf->flags |= PIPE_BUF_FLAG_LRU;
			return 0;
		}
	}

	/*
	 * Raced with truncate or failed to remove page from current
	 * address space, unlock and return failure.
	 */
out_unlock:
	unlock_page(page);
	return 1;
}
84 84
85 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, 85 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
86 struct pipe_buffer *buf) 86 struct pipe_buffer *buf)
87 { 87 {
88 page_cache_release(buf->page); 88 page_cache_release(buf->page);
89 buf->flags &= ~PIPE_BUF_FLAG_LRU; 89 buf->flags &= ~PIPE_BUF_FLAG_LRU;
90 } 90 }
91 91
92 /* 92 /*
93 * Check whether the contents of buf is OK to access. Since the content 93 * Check whether the contents of buf is OK to access. Since the content
94 * is a page cache page, IO may be in flight. 94 * is a page cache page, IO may be in flight.
95 */ 95 */
96 static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, 96 static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
97 struct pipe_buffer *buf) 97 struct pipe_buffer *buf)
98 { 98 {
99 struct page *page = buf->page; 99 struct page *page = buf->page;
100 int err; 100 int err;
101 101
102 if (!PageUptodate(page)) { 102 if (!PageUptodate(page)) {
103 lock_page(page); 103 lock_page(page);
104 104
105 /* 105 /*
106 * Page got truncated/unhashed. This will cause a 0-byte 106 * Page got truncated/unhashed. This will cause a 0-byte
107 * splice, if this is the first page. 107 * splice, if this is the first page.
108 */ 108 */
109 if (!page->mapping) { 109 if (!page->mapping) {
110 err = -ENODATA; 110 err = -ENODATA;
111 goto error; 111 goto error;
112 } 112 }
113 113
114 /* 114 /*
115 * Uh oh, read-error from disk. 115 * Uh oh, read-error from disk.
116 */ 116 */
117 if (!PageUptodate(page)) { 117 if (!PageUptodate(page)) {
118 err = -EIO; 118 err = -EIO;
119 goto error; 119 goto error;
120 } 120 }
121 121
122 /* 122 /*
123 * Page is ok afterall, we are done. 123 * Page is ok afterall, we are done.
124 */ 124 */
125 unlock_page(page); 125 unlock_page(page);
126 } 126 }
127 127
128 return 0; 128 return 0;
129 error: 129 error:
130 unlock_page(page); 130 unlock_page(page);
131 return err; 131 return err;
132 } 132 }
133 133
/* Buffer ops for pipe buffers backed by page-cache pages. */
static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = page_cache_pipe_buf_confirm,
	.release = page_cache_pipe_buf_release,
	.steal = page_cache_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};
143 143
144 static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, 144 static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
145 struct pipe_buffer *buf) 145 struct pipe_buffer *buf)
146 { 146 {
147 if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) 147 if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
148 return 1; 148 return 1;
149 149
150 buf->flags |= PIPE_BUF_FLAG_LRU; 150 buf->flags |= PIPE_BUF_FLAG_LRU;
151 return generic_pipe_buf_steal(pipe, buf); 151 return generic_pipe_buf_steal(pipe, buf);
152 } 152 }
153 153
/* Buffer ops for pipe buffers holding user pages (vmsplice). */
static const struct pipe_buf_operations user_page_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = page_cache_pipe_buf_release,
	.steal = user_page_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};
163 163
/**
 * splice_to_pipe - fill passed data into a pipe
 * @pipe:	pipe to fill
 * @spd:	data to fill
 *
 * Description:
 *    @spd contains a map of pages and len/offset tuples, along with
 *    the struct pipe_buf_operations associated with these pages. This
 *    function will link that data to the pipe.
 *
 *    Returns the number of bytes linked into the pipe, or a negative
 *    errno (-EPIPE, -EAGAIN, -ERESTARTSYS) if nothing was transferred.
 */
ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
		       struct splice_pipe_desc *spd)
{
	/* Remember the original page count so unused pages can be released. */
	unsigned int spd_pages = spd->nr_pages;
	int ret, do_wakeup, page_nr;

	ret = 0;
	do_wakeup = 0;
	page_nr = 0;

	/*
	 * Serialize against other users of the pipe.  pipe->inode can
	 * apparently be NULL here (checked throughout) — presumably for
	 * internal pipes with no backing inode; TODO confirm.
	 */
	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	for (;;) {
		/* No readers left: writing is pointless, raise SIGPIPE. */
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		if (pipe->nrbufs < PIPE_BUFFERS) {
			/* Slot index wraps; PIPE_BUFFERS is a power of two. */
			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;

			/* Link the next page of the map into the pipe. */
			buf->page = spd->pages[page_nr];
			buf->offset = spd->partial[page_nr].offset;
			buf->len = spd->partial[page_nr].len;
			buf->private = spd->partial[page_nr].private;
			buf->ops = spd->ops;
			if (spd->flags & SPLICE_F_GIFT)
				buf->flags |= PIPE_BUF_FLAG_GIFT;

			pipe->nrbufs++;
			page_nr++;
			ret += buf->len;

			if (pipe->inode)
				do_wakeup = 1;

			/* All pages linked? Done. */
			if (!--spd->nr_pages)
				break;
			/* Room for more? Keep filling without sleeping. */
			if (pipe->nrbufs < PIPE_BUFFERS)
				continue;

			break;
		}

		/* Pipe is full and we may not block. */
		if (spd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		/* Wake readers before sleeping so they can drain the pipe. */
		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}

		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	if (pipe->inode) {
		mutex_unlock(&pipe->inode->i_mutex);

		/* Final wakeup for any data linked since the last one. */
		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		}
	}

	/*
	 * Release any pages we did not consume (early break on full pipe,
	 * error, or signal) via the caller-supplied release hook.
	 */
	while (page_nr < spd_pages)
		spd->spd_release(spd, page_nr++);

	return ret;
}
264 264
/*
 * Default ->spd_release hook for __generic_file_splice_read: drop the
 * page cache reference taken when page @i was added to the map.
 */
static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
{
	page_cache_release(spd->pages[i]);
}
269 269
/*
 * Core of generic_file_splice_read(): gather up to PIPE_BUFFERS page
 * cache pages covering [*ppos, *ppos + len), start readahead/IO where
 * needed, then link the uptodate portions into @pipe.
 *
 * Returns the number of bytes spliced, or a negative errno if no page
 * could be set up at all.
 */
static int
__generic_file_splice_read(struct file *in, loff_t *ppos,
			   struct pipe_inode_info *pipe, size_t len,
			   unsigned int flags)
{
	struct address_space *mapping = in->f_mapping;
	unsigned int loff, nr_pages, req_pages;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct page *page;
	pgoff_t index, end_index;
	loff_t isize;
	int error, page_nr;
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &page_cache_pipe_buf_ops,
		.spd_release = spd_release_page,
	};

	index = *ppos >> PAGE_CACHE_SHIFT;
	loff = *ppos & ~PAGE_CACHE_MASK;
	/* Pages needed to cover the request, rounded up; clamp to the pipe. */
	req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS);

	/*
	 * Lookup the (hopefully) full range of pages we need.
	 */
	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
	index += spd.nr_pages;

	/*
	 * If find_get_pages_contig() returned fewer pages than we needed,
	 * readahead/allocate the rest and fill in the holes.
	 */
	if (spd.nr_pages < nr_pages)
		page_cache_sync_readahead(mapping, &in->f_ra, in,
				index, req_pages - spd.nr_pages);

	error = 0;
	while (spd.nr_pages < nr_pages) {
		/*
		 * Page could be there, find_get_pages_contig() breaks on
		 * the first hole.
		 */
		page = find_get_page(mapping, index);
		if (!page) {
			/*
			 * page didn't exist, allocate one.
			 */
			page = page_cache_alloc_cold(mapping);
			if (!page)
				break;

			error = add_to_page_cache_lru(page, mapping, index,
						mapping_gfp_mask(mapping));
			if (unlikely(error)) {
				page_cache_release(page);
				/* Someone else added it meanwhile: retry. */
				if (error == -EEXIST)
					continue;
				break;
			}
			/*
			 * add_to_page_cache() locks the page, unlock it
			 * to avoid convoluting the logic below even more.
			 */
			unlock_page(page);
		}

		pages[spd.nr_pages++] = page;
		index++;
	}

	/*
	 * Now loop over the map and see if we need to start IO on any
	 * pages, fill in the partial map, etc.
	 */
	index = *ppos >> PAGE_CACHE_SHIFT;
	nr_pages = spd.nr_pages;
	spd.nr_pages = 0;
	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
		unsigned int this_len;

		if (!len)
			break;

		/*
		 * this_len is the max we'll use from this page
		 */
		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
		page = pages[page_nr];

		if (PageReadahead(page))
			page_cache_async_readahead(mapping, &in->f_ra, in,
					page, index, req_pages - page_nr);

		/*
		 * If the page isn't uptodate, we may need to start io on it
		 */
		if (!PageUptodate(page)) {
			/*
			 * If in nonblock mode then dont block on waiting
			 * for an in-flight io page
			 */
			if (flags & SPLICE_F_NONBLOCK) {
				if (!trylock_page(page)) {
					error = -EAGAIN;
					break;
				}
			} else
				lock_page(page);

			/*
			 * Page was truncated, or invalidated by the
			 * filesystem.  Redo the find/create, but this time the
			 * page is kept locked, so there's no chance of another
			 * race with truncate/invalidate.
			 */
			if (!page->mapping) {
				unlock_page(page);
				page = find_or_create_page(mapping, index,
						mapping_gfp_mask(mapping));

				if (!page) {
					error = -ENOMEM;
					break;
				}
				/* Swap the new page into our local map. */
				page_cache_release(pages[page_nr]);
				pages[page_nr] = page;
			}
			/*
			 * page was already under io and is now done, great
			 */
			if (PageUptodate(page)) {
				unlock_page(page);
				goto fill_it;
			}

			/*
			 * need to read in the page
			 */
			error = mapping->a_ops->readpage(in, page);
			if (unlikely(error)) {
				/*
				 * We really should re-lookup the page here,
				 * but it complicates things a lot. Instead
				 * lets just do what we already stored, and
				 * we'll get it the next time we are called.
				 */
				if (error == AOP_TRUNCATED_PAGE)
					error = 0;

				break;
			}
		}
fill_it:
		/*
		 * i_size must be checked after PageUptodate.
		 */
		isize = i_size_read(mapping->host);
		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
		if (unlikely(!isize || index > end_index))
			break;

		/*
		 * if this is the last page, see if we need to shrink
		 * the length and stop
		 */
		if (end_index == index) {
			unsigned int plen;

			/*
			 * max good bytes in this page
			 */
			plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (plen <= loff)
				break;

			/*
			 * force quit after adding this page
			 */
			this_len = min(this_len, plen - loff);
			len = this_len;
		}

		partial[page_nr].offset = loff;
		partial[page_nr].len = this_len;
		len -= this_len;
		loff = 0;	/* only the first page has a nonzero offset */
		spd.nr_pages++;
		index++;
	}

	/*
	 * Release any pages at the end, if we quit early. 'page_nr' is how far
	 * we got, 'nr_pages' is how many pages are in the map.
	 */
	while (page_nr < nr_pages)
		page_cache_release(pages[page_nr++]);
	in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;

	/* Any data at all takes precedence over a pending error. */
	if (spd.nr_pages)
		return splice_to_pipe(pipe, &spd);

	return error;
}
477 477
478 /** 478 /**
479 * generic_file_splice_read - splice data from file to a pipe 479 * generic_file_splice_read - splice data from file to a pipe
480 * @in: file to splice from 480 * @in: file to splice from
481 * @ppos: position in @in 481 * @ppos: position in @in
482 * @pipe: pipe to splice to 482 * @pipe: pipe to splice to
483 * @len: number of bytes to splice 483 * @len: number of bytes to splice
484 * @flags: splice modifier flags 484 * @flags: splice modifier flags
485 * 485 *
486 * Description: 486 * Description:
487 * Will read pages from given file and fill them into a pipe. Can be 487 * Will read pages from given file and fill them into a pipe. Can be
488 * used as long as the address_space operations for the source implements 488 * used as long as the address_space operations for the source implements
489 * a readpage() hook. 489 * a readpage() hook.
490 * 490 *
491 */ 491 */
492 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, 492 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
493 struct pipe_inode_info *pipe, size_t len, 493 struct pipe_inode_info *pipe, size_t len,
494 unsigned int flags) 494 unsigned int flags)
495 { 495 {
496 loff_t isize, left; 496 loff_t isize, left;
497 int ret; 497 int ret;
498 498
499 isize = i_size_read(in->f_mapping->host); 499 isize = i_size_read(in->f_mapping->host);
500 if (unlikely(*ppos >= isize)) 500 if (unlikely(*ppos >= isize))
501 return 0; 501 return 0;
502 502
503 left = isize - *ppos; 503 left = isize - *ppos;
504 if (unlikely(left < len)) 504 if (unlikely(left < len))
505 len = left; 505 len = left;
506 506
507 ret = __generic_file_splice_read(in, ppos, pipe, len, flags); 507 ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
508 if (ret > 0) 508 if (ret > 0)
509 *ppos += ret; 509 *ppos += ret;
510 510
511 return ret; 511 return ret;
512 } 512 }
513 513
514 EXPORT_SYMBOL(generic_file_splice_read); 514 EXPORT_SYMBOL(generic_file_splice_read);
515 515
516 /* 516 /*
517 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' 517 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
518 * using sendpage(). Return the number of bytes sent. 518 * using sendpage(). Return the number of bytes sent.
519 */ 519 */
520 static int pipe_to_sendpage(struct pipe_inode_info *pipe, 520 static int pipe_to_sendpage(struct pipe_inode_info *pipe,
521 struct pipe_buffer *buf, struct splice_desc *sd) 521 struct pipe_buffer *buf, struct splice_desc *sd)
522 { 522 {
523 struct file *file = sd->u.file; 523 struct file *file = sd->u.file;
524 loff_t pos = sd->pos; 524 loff_t pos = sd->pos;
525 int ret, more; 525 int ret, more;
526 526
527 ret = buf->ops->confirm(pipe, buf); 527 ret = buf->ops->confirm(pipe, buf);
528 if (!ret) { 528 if (!ret) {
529 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; 529 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
530 530
531 ret = file->f_op->sendpage(file, buf->page, buf->offset, 531 ret = file->f_op->sendpage(file, buf->page, buf->offset,
532 sd->len, &pos, more); 532 sd->len, &pos, more);
533 } 533 }
534 534
535 return ret; 535 return ret;
536 } 536 }
537 537
/*
 * This is a little more tricky than the file -> pipe splicing. There are
 * basically three cases:
 *
 *	- Destination page already exists in the address space and there
 *	  are users of it. For that case we have no other option that
 *	  copying the data. Tough luck.
 *	- Destination page already exists in the address space, but there
 *	  are no users of it. Make sure it's uptodate, then drop it. Fall
 *	  through to last case.
 *	- Destination page does not exist, we can add the pipe page to
 *	  the page cache and avoid the copy.
 *
 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
 * sd->flags), we attempt to migrate pages from the pipe to the output
 * file address space page cache. This is possible if no one else has
 * the pipe page referenced outside of the pipe and page cache. If
 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
 * a new page in the output file page cache and fill/dirty that.
 */
static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	struct file *file = sd->u.file;
	struct address_space *mapping = file->f_mapping;
	unsigned int offset, this_len;
	struct page *page;
	void *fsdata;
	int ret;

	/*
	 * make sure the data in this buffer is uptodate
	 */
	ret = buf->ops->confirm(pipe, buf);
	if (unlikely(ret))
		return ret;

	/* Byte offset of the write within its destination page. */
	offset = sd->pos & ~PAGE_CACHE_MASK;

	/* Never cross a page boundary in one go. */
	this_len = sd->len;
	if (this_len + offset > PAGE_CACHE_SIZE)
		this_len = PAGE_CACHE_SIZE - offset;

	ret = pagecache_write_begin(file, mapping, sd->pos, this_len,
				AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
	if (unlikely(ret))
		goto out;

	/*
	 * If write_begin handed back a different page than the pipe buffer's,
	 * the data must be copied in.  If it's the same page (steal
	 * succeeded), no copy is needed.
	 */
	if (buf->page != page) {
		/*
		 * Careful, ->map() uses KM_USER0!
		 */
		char *src = buf->ops->map(pipe, buf, 1);
		char *dst = kmap_atomic(page, KM_USER1);

		memcpy(dst + offset, src + buf->offset, this_len);
		flush_dcache_page(page);
		kunmap_atomic(dst, KM_USER1);
		buf->ops->unmap(pipe, buf, src);
	}
	ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
				page, fsdata);
out:
	return ret;
}
603 603
/**
 * __splice_from_pipe - splice data from a pipe to given actor
 * @pipe:	pipe to splice from
 * @sd:		information to @actor
 * @actor:	handler that splices the data
 *
 * Description:
 *    This function does little more than loop over the pipe and call
 *    @actor to do the actual moving of a single struct pipe_buffer to
 *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
 *    pipe_to_user.
 *
 *    The caller is responsible for any locking needed (see
 *    splice_from_pipe() for the locked variant).  Returns the number of
 *    bytes consumed, or a negative errno if nothing was transferred.
 */
ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
			   splice_actor *actor)
{
	int ret, do_wakeup, err;

	ret = 0;
	do_wakeup = 0;

	for (;;) {
		if (pipe->nrbufs) {
			/* Oldest buffer in the ring. */
			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
			const struct pipe_buf_operations *ops = buf->ops;

			sd->len = buf->len;
			if (sd->len > sd->total_len)
				sd->len = sd->total_len;

			err = actor(pipe, buf, sd);
			if (err <= 0) {
				/* -ENODATA means "stop quietly", keep ret. */
				if (!ret && err != -ENODATA)
					ret = err;

				break;
			}

			/* Account for a partial or full consume of this buffer. */
			ret += err;
			buf->offset += err;
			buf->len -= err;

			sd->len -= err;
			sd->pos += err;
			sd->total_len -= err;
			/* Actor took less than offered: retry the same buffer. */
			if (sd->len)
				continue;

			/* Buffer fully drained: release it and advance the ring. */
			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
				pipe->nrbufs--;
				if (pipe->inode)
					do_wakeup = 1;
			}

			if (!sd->total_len)
				break;
		}

		if (pipe->nrbufs)
			continue;
		/* Pipe is empty and has no writers: EOF. */
		if (!pipe->writers)
			break;
		/* No writer about to refill the pipe: return what we have. */
		if (!pipe->waiting_writers) {
			if (ret)
				break;
		}

		if (sd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		/* Wake writers before sleeping so they can refill the pipe. */
		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
			do_wakeup = 0;
		}

		pipe_wait(pipe);
	}

	/* Final wakeup for any buffers freed since the last one. */
	if (do_wakeup) {
		smp_mb();
		if (waitqueue_active(&pipe->wait))
			wake_up_interruptible(&pipe->wait);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}

	return ret;
}
EXPORT_SYMBOL(__splice_from_pipe);
707 707
/**
 * splice_from_pipe - splice data from a pipe to a file
 * @pipe:	pipe to splice from
 * @out:	file to splice to
 * @ppos:	position in @out
 * @len:	how many bytes to splice
 * @flags:	splice modifier flags
 * @actor:	handler that splices the data
 *
 * Description:
 *    See __splice_from_pipe. This function locks the input and output inodes,
 *    otherwise it's identical to __splice_from_pipe().
 *
 */
ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags,
			 splice_actor *actor)
{
	ssize_t ret;
	struct inode *inode = out->f_mapping->host;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};

	/*
	 * The actor worker might be calling ->write_begin and
	 * ->write_end. Most of the time, these expect i_mutex to
	 * be held. Since this may result in an ABBA deadlock with
	 * pipe->inode, we have to order lock acquiry here.
	 *
	 * Outer lock must be inode->i_mutex, as pipe_wait() will
	 * release and reacquire pipe->inode->i_mutex, AND inode must
	 * never be a pipe.
	 */
	/* The fix above relies on @out not being a pipe: assert it. */
	WARN_ON(S_ISFIFO(inode->i_mode));
	/* Lockdep subclasses keep the two i_mutex acquisitions distinct. */
	mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
	if (pipe->inode)
		mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);
	ret = __splice_from_pipe(pipe, &sd, actor);
	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);
	mutex_unlock(&inode->i_mutex);

	return ret;
}
747 756
/**
 * generic_file_splice_write_nolock - generic_file_splice_write without mutexes
 * @pipe:	pipe info
 * @out:	file to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will either move or copy pages (determined by @flags options) from
 *    the given pipe inode to the given file. The caller is responsible
 *    for acquiring i_mutex on both inodes.
 *
 */
ssize_t
generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
				 loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};
	ssize_t ret;
	int err;

	/* Writing by an unprivileged user clears setuid/setgid bits. */
	err = file_remove_suid(out);
	if (unlikely(err))
		return err;

	ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
	if (ret > 0) {
		unsigned long nr_pages;

		*ppos += ret;
		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);

			/* Report the sync failure instead of the byte count. */
			if (err)
				ret = err;
		}
		/* Throttle the writer if we dirtied too many pages. */
		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write_nolock);
806 815
/**
 * generic_file_splice_write - splice data from a pipe to a file
 * @pipe:	pipe info
 * @out:	file to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will either move or copy pages (determined by @flags options) from
 *    the given pipe inode to the given file.
 *
 */
ssize_t
generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
			  loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};
	ssize_t ret;

	/*
	 * Explicit lock ordering replaces the old pointer-ordered
	 * inode_double_lock(): the target inode is always the outer lock
	 * and the pipe inode the inner one.  This is safe because pipes
	 * and splice targets form disjoint sets (asserted below), and it
	 * avoids an ABBA deadlock: __splice_from_pipe() may drop and
	 * re-take the pipe mutex in pipe_wait(), and with pointer
	 * ordering another task could then grab the pipe mutex while
	 * holding our target's i_mutex.
	 */
	WARN_ON(S_ISFIFO(inode->i_mode));
	mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
	ret = file_remove_suid(out);
	if (likely(!ret)) {
		/*
		 * Inner lock: only taken once suid removal succeeded, and
		 * only if the pipe has a backing inode.
		 */
		if (pipe->inode)
			mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);
		ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
		if (pipe->inode)
			mutex_unlock(&pipe->inode->i_mutex);
	}
	mutex_unlock(&inode->i_mutex);
	if (ret > 0) {
		unsigned long nr_pages;

		*ppos += ret;
		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			int err;

			/* Re-take i_mutex only for the duration of the sync. */
			mutex_lock(&inode->i_mutex);
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);
			mutex_unlock(&inode->i_mutex);

			if (err)
				ret = err;
		}
		/* Throttle the writer if we dirtied too many pages. */
		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write);
867 882
/**
 * generic_splice_sendpage - splice data from a pipe to a socket
 * @pipe:	pipe to splice from
 * @out:	socket to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will send @len bytes from the pipe to a network socket. No data copying
 *    is involved.
 *
 */
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
				loff_t *ppos, size_t len, unsigned int flags)
{
	/* Thin wrapper: per-buffer work is done by pipe_to_sendpage(). */
	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
}

EXPORT_SYMBOL(generic_splice_sendpage);
888 903
889 /* 904 /*
890 * Attempt to initiate a splice from pipe to file. 905 * Attempt to initiate a splice from pipe to file.
891 */ 906 */
892 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, 907 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
893 loff_t *ppos, size_t len, unsigned int flags) 908 loff_t *ppos, size_t len, unsigned int flags)
894 { 909 {
895 int ret; 910 int ret;
896 911
897 if (unlikely(!out->f_op || !out->f_op->splice_write)) 912 if (unlikely(!out->f_op || !out->f_op->splice_write))
898 return -EINVAL; 913 return -EINVAL;
899 914
900 if (unlikely(!(out->f_mode & FMODE_WRITE))) 915 if (unlikely(!(out->f_mode & FMODE_WRITE)))
901 return -EBADF; 916 return -EBADF;
902 917
903 if (unlikely(out->f_flags & O_APPEND)) 918 if (unlikely(out->f_flags & O_APPEND))
904 return -EINVAL; 919 return -EINVAL;
905 920
906 ret = rw_verify_area(WRITE, out, ppos, len); 921 ret = rw_verify_area(WRITE, out, ppos, len);
907 if (unlikely(ret < 0)) 922 if (unlikely(ret < 0))
908 return ret; 923 return ret;
909 924
910 return out->f_op->splice_write(pipe, out, ppos, len, flags); 925 return out->f_op->splice_write(pipe, out, ppos, len, flags);
911 } 926 }
912 927
913 /* 928 /*
914 * Attempt to initiate a splice from a file to a pipe. 929 * Attempt to initiate a splice from a file to a pipe.
915 */ 930 */
916 static long do_splice_to(struct file *in, loff_t *ppos, 931 static long do_splice_to(struct file *in, loff_t *ppos,
917 struct pipe_inode_info *pipe, size_t len, 932 struct pipe_inode_info *pipe, size_t len,
918 unsigned int flags) 933 unsigned int flags)
919 { 934 {
920 int ret; 935 int ret;
921 936
922 if (unlikely(!in->f_op || !in->f_op->splice_read)) 937 if (unlikely(!in->f_op || !in->f_op->splice_read))
923 return -EINVAL; 938 return -EINVAL;
924 939
925 if (unlikely(!(in->f_mode & FMODE_READ))) 940 if (unlikely(!(in->f_mode & FMODE_READ)))
926 return -EBADF; 941 return -EBADF;
927 942
928 ret = rw_verify_area(READ, in, ppos, len); 943 ret = rw_verify_area(READ, in, ppos, len);
929 if (unlikely(ret < 0)) 944 if (unlikely(ret < 0))
930 return ret; 945 return ret;
931 946
932 return in->f_op->splice_read(in, ppos, pipe, len, flags); 947 return in->f_op->splice_read(in, ppos, pipe, len, flags);
933 } 948 }
934 949
/**
 * splice_direct_to_actor - splices data directly between two non-pipes
 * @in:		file to splice from
 * @sd:		actor information on where to splice to
 * @actor:	handles the data splicing
 *
 * Description:
 *    This is a special case helper to splice directly between two
 *    points, without requiring an explicit pipe. Internally an allocated
 *    pipe is cached in the process, and reused during the lifetime of
 *    that process.
 *
 */
ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
			       splice_direct_actor *actor)
{
	struct pipe_inode_info *pipe;
	long ret, bytes;
	umode_t i_mode;
	size_t len;
	int i, flags;

	/*
	 * We require the input being a regular file, as we don't want to
	 * randomly drop data for eg socket -> socket splicing. Use the
	 * piped splicing for that!
	 */
	i_mode = in->f_path.dentry->d_inode->i_mode;
	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
		return -EINVAL;

	/*
	 * neither in nor out is a pipe, setup an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info(NULL);
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		/* Cache the pipe on the task for reuse on later calls. */
		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	ret = 0;
	bytes = 0;
	len = sd->total_len;
	flags = sd->flags;

	/*
	 * Don't block on output, we have to drain the direct pipe.
	 */
	sd->flags &= ~SPLICE_F_NONBLOCK;

	while (len) {
		size_t read_len;
		/* prev_pos lets us rewind sd->pos on a short/failed write. */
		loff_t pos = sd->pos, prev_pos = pos;

		ret = do_splice_to(in, &pos, pipe, len, flags);
		if (unlikely(ret <= 0))
			goto out_release;

		read_len = ret;
		/* The actor should consume exactly what was just read. */
		sd->total_len = read_len;

		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = actor(pipe, sd);
		if (unlikely(ret <= 0)) {
			sd->pos = prev_pos;
			goto out_release;
		}

		bytes += ret;
		len -= ret;
		sd->pos = pos;

		/* Partial write: reposition after what was consumed. */
		if (ret < read_len) {
			sd->pos = prev_pos + ret;
			goto out_release;
		}
	}

done:
	/* Reset the internal pipe so it is empty for the next caller. */
	pipe->nrbufs = pipe->curbuf = 0;
	file_accessed(in);
	return bytes;

out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < PIPE_BUFFERS; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;

		if (buf->ops) {
			buf->ops->release(pipe, buf);
			buf->ops = NULL;
		}
	}

	/* If nothing was transferred, propagate the error code instead. */
	if (!bytes)
		bytes = ret;

	goto done;
}
EXPORT_SYMBOL(splice_direct_to_actor);
1056 1071
1057 static int direct_splice_actor(struct pipe_inode_info *pipe, 1072 static int direct_splice_actor(struct pipe_inode_info *pipe,
1058 struct splice_desc *sd) 1073 struct splice_desc *sd)
1059 { 1074 {
1060 struct file *file = sd->u.file; 1075 struct file *file = sd->u.file;
1061 1076
1062 return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags); 1077 return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags);
1063 } 1078 }
1064 1079
1065 /** 1080 /**
1066 * do_splice_direct - splices data directly between two files 1081 * do_splice_direct - splices data directly between two files
1067 * @in: file to splice from 1082 * @in: file to splice from
1068 * @ppos: input file offset 1083 * @ppos: input file offset
1069 * @out: file to splice to 1084 * @out: file to splice to
1070 * @len: number of bytes to splice 1085 * @len: number of bytes to splice
1071 * @flags: splice modifier flags 1086 * @flags: splice modifier flags
1072 * 1087 *
1073 * Description: 1088 * Description:
1074 * For use by do_sendfile(). splice can easily emulate sendfile, but 1089 * For use by do_sendfile(). splice can easily emulate sendfile, but
1075 * doing it in the application would incur an extra system call 1090 * doing it in the application would incur an extra system call
1076 * (splice in + splice out, as compared to just sendfile()). So this helper 1091 * (splice in + splice out, as compared to just sendfile()). So this helper
1077 * can splice directly through a process-private pipe. 1092 * can splice directly through a process-private pipe.
1078 * 1093 *
1079 */ 1094 */
1080 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, 1095 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1081 size_t len, unsigned int flags) 1096 size_t len, unsigned int flags)
1082 { 1097 {
1083 struct splice_desc sd = { 1098 struct splice_desc sd = {
1084 .len = len, 1099 .len = len,
1085 .total_len = len, 1100 .total_len = len,
1086 .flags = flags, 1101 .flags = flags,
1087 .pos = *ppos, 1102 .pos = *ppos,
1088 .u.file = out, 1103 .u.file = out,
1089 }; 1104 };
1090 long ret; 1105 long ret;
1091 1106
1092 ret = splice_direct_to_actor(in, &sd, direct_splice_actor); 1107 ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1093 if (ret > 0) 1108 if (ret > 0)
1094 *ppos = sd.pos; 1109 *ppos = sd.pos;
1095 1110
1096 return ret; 1111 return ret;
1097 } 1112 }
1098 1113
1099 /* 1114 /*
1100 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same 1115 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1101 * location, so checking ->i_pipe is not enough to verify that this is a 1116 * location, so checking ->i_pipe is not enough to verify that this is a
1102 * pipe. 1117 * pipe.
1103 */ 1118 */
1104 static inline struct pipe_inode_info *pipe_info(struct inode *inode) 1119 static inline struct pipe_inode_info *pipe_info(struct inode *inode)
1105 { 1120 {
1106 if (S_ISFIFO(inode->i_mode)) 1121 if (S_ISFIFO(inode->i_mode))
1107 return inode->i_pipe; 1122 return inode->i_pipe;
1108 1123
1109 return NULL; 1124 return NULL;
1110 } 1125 }
1111 1126
/*
 * Determine where to splice to/from: exactly one of the two files must
 * be a pipe.  Handles the optional user-supplied offsets and copies the
 * updated offset back out.
 */
static long do_splice(struct file *in, loff_t __user *off_in,
		      struct file *out, loff_t __user *off_out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	loff_t offset, *off;
	long ret;

	/* Case 1: splicing from a pipe into a regular target. */
	pipe = pipe_info(in->f_path.dentry->d_inode);
	if (pipe) {
		/* A pipe has no file position; an input offset is invalid. */
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			/* An explicit offset requires a seekable output. */
			if (out->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &out->f_pos;

		ret = do_splice_from(pipe, out, off, len, flags);

		/* Report the updated offset back to user space. */
		if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	/* Case 2: splicing from a regular source into a pipe. */
	pipe = pipe_info(out->f_path.dentry->d_inode);
	if (pipe) {
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			if (in->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &in->f_pos;

		ret = do_splice_to(in, off, pipe, len, flags);

		if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	/* Neither end is a pipe: not a valid splice operation. */
	return -EINVAL;
}
1167 1182
/*
 * Map an iov into an array of pages and offset/length tupples. With the
 * partial_page structure, we can map several non-contiguous ranges into
 * our ones pages[] map instead of splitting that operation into pieces.
 * Could easily be exported as a generic helper for other users, in which
 * case one would probably want to add a 'max_nr_pages' parameter as well.
 *
 * Returns the number of partial-page entries filled (> 0), or a negative
 * errno if nothing could be mapped.
 */
static int get_iovec_page_array(const struct iovec __user *iov,
				unsigned int nr_vecs, struct page **pages,
				struct partial_page *partial, int aligned)
{
	int buffers = 0, error = 0;

	while (nr_vecs) {
		unsigned long off, npages;
		struct iovec entry;
		void __user *base;
		size_t len;
		int i;

		error = -EFAULT;
		if (copy_from_user(&entry, iov, sizeof(entry)))
			break;

		base = entry.iov_base;
		len = entry.iov_len;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		error = 0;
		if (unlikely(!len))
			break;
		error = -EFAULT;
		if (!access_ok(VERIFY_READ, base, len))
			break;

		/*
		 * Get this base offset and number of pages, then map
		 * in the user pages.
		 */
		off = (unsigned long) base & ~PAGE_MASK;

		/*
		 * If asked for alignment, the offset must be zero and the
		 * length a multiple of the PAGE_SIZE.
		 */
		error = -EINVAL;
		if (aligned && (off || len & ~PAGE_MASK))
			break;

		/* Clamp to the room left in the pages[] array. */
		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (npages > PIPE_BUFFERS - buffers)
			npages = PIPE_BUFFERS - buffers;

		/* On success, 'error' holds the number of pages pinned. */
		error = get_user_pages_fast((unsigned long)base, npages,
					0, &pages[buffers]);

		if (unlikely(error <= 0))
			break;

		/*
		 * Fill this contiguous range into the partial page map.
		 */
		for (i = 0; i < error; i++) {
			const int plen = min_t(size_t, len, PAGE_SIZE - off);

			partial[buffers].offset = off;
			partial[buffers].len = plen;

			/* Only the first page of a range has an offset. */
			off = 0;
			len -= plen;
			buffers++;
		}

		/*
		 * We didn't complete this iov, stop here since it probably
		 * means we have to move some of this into a pipe to
		 * be able to continue.
		 */
		if (len)
			break;

		/*
		 * Don't continue if we mapped fewer pages than we asked for,
		 * or if we mapped the max number of pages that we have
		 * room for.
		 */
		if (error < npages || buffers == PIPE_BUFFERS)
			break;

		nr_vecs--;
		iov++;
	}

	/* Partial success wins over a trailing error. */
	if (buffers)
		return buffers;

	return error;
}
1268 1283
/*
 * Per-buffer actor for vmsplice_to_user(): copy one pipe buffer's worth
 * of data to the user address in sd->u.userptr, trying an atomic copy
 * first and falling back to the sleeping copy path.  Returns the number
 * of bytes copied or a negative errno.
 */
static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	char *src;
	int ret;

	/* Make sure the buffer's data is actually available (stable). */
	ret = buf->ops->confirm(pipe, buf);
	if (unlikely(ret))
		return ret;

	/*
	 * See if we can use the atomic maps, by prefaulting in the
	 * pages and doing an atomic copy
	 */
	if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
		src = buf->ops->map(pipe, buf, 1);
		ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
							sd->len);
		buf->ops->unmap(pipe, buf, src);
		if (!ret) {
			/* Atomic copy succeeded in full. */
			ret = sd->len;
			goto out;
		}
	}

	/*
	 * No dice, use slow non-atomic map and copy
	 */
	src = buf->ops->map(pipe, buf, 0);

	ret = sd->len;
	if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
		ret = -EFAULT;

	buf->ops->unmap(pipe, buf, src);
out:
	/* Advance the destination pointer past what was copied. */
	if (ret > 0)
		sd->u.userptr += ret;
	return ret;
}
1309 1324
/*
 * For lack of a better implementation, implement vmsplice() to userspace
 * as a simple copy of the pipes pages to the user iov.
 */
static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	struct splice_desc sd;
	ssize_t size;
	int error;
	long ret;

	/* The source file must actually be a pipe. */
	pipe = pipe_info(file->f_path.dentry->d_inode);
	if (!pipe)
		return -EBADF;

	/* Hold the pipe mutex across the whole iov walk. */
	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	error = ret = 0;
	while (nr_segs) {
		void __user *base;
		size_t len;

		/*
		 * Get user address base and length for this iovec.
		 */
		error = get_user(base, &iov->iov_base);
		if (unlikely(error))
			break;
		error = get_user(len, &iov->iov_len);
		if (unlikely(error))
			break;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		if (unlikely(!len))
			break;
		if (unlikely(!base)) {
			error = -EFAULT;
			break;
		}

		if (unlikely(!access_ok(VERIFY_WRITE, base, len))) {
			error = -EFAULT;
			break;
		}

		/* Set up the copy destination for pipe_to_user(). */
		sd.len = 0;
		sd.total_len = len;
		sd.flags = flags;
		sd.u.userptr = base;
		sd.pos = 0;

		size = __splice_from_pipe(pipe, &sd, pipe_to_user);
		if (size < 0) {
			/* Report the error only if nothing was copied yet. */
			if (!ret)
				ret = size;

			break;
		}

		ret += size;

		/* Short copy: the pipe ran dry, stop here. */
		if (size < len)
			break;

		nr_segs--;
		iov++;
	}

	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);

	/* No bytes transferred: return the first error (or 0). */
	if (!ret)
		ret = error;

	return ret;
}
1391 1406
/*
 * vmsplice splices a user address range into a pipe. It can be thought of
 * as splice-from-memory, where the regular splice is splice-from-file (or
 * to file). In both cases the output is a pipe, naturally.
 */
static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &user_page_pipe_buf_ops,
		.spd_release = spd_release_page,
	};

	/* The destination file must actually be a pipe. */
	pipe = pipe_info(file->f_path.dentry->d_inode);
	if (!pipe)
		return -EBADF;

	/*
	 * Pin the user pages; SPLICE_F_GIFT demands page-aligned,
	 * page-sized segments (the 'aligned' argument).
	 */
	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
					    flags & SPLICE_F_GIFT);
	if (spd.nr_pages <= 0)
		return spd.nr_pages;

	return splice_to_pipe(pipe, &spd);
}
1422 1437
1423 /* 1438 /*
1424 * Note that vmsplice only really supports true splicing _from_ user memory 1439 * Note that vmsplice only really supports true splicing _from_ user memory
1425 * to a pipe, not the other way around. Splicing from user memory is a simple 1440 * to a pipe, not the other way around. Splicing from user memory is a simple
1426 * operation that can be supported without any funky alignment restrictions 1441 * operation that can be supported without any funky alignment restrictions
1427 * or nasty vm tricks. We simply map in the user memory and fill them into 1442 * or nasty vm tricks. We simply map in the user memory and fill them into
1428 * a pipe. The reverse isn't quite as easy, though. There are two possible 1443 * a pipe. The reverse isn't quite as easy, though. There are two possible
1429 * solutions for that: 1444 * solutions for that:
1430 * 1445 *
1431 * - memcpy() the data internally, at which point we might as well just 1446 * - memcpy() the data internally, at which point we might as well just
1432 * do a regular read() on the buffer anyway. 1447 * do a regular read() on the buffer anyway.
1433 * - Lots of nasty vm tricks, that are neither fast nor flexible (it 1448 * - Lots of nasty vm tricks, that are neither fast nor flexible (it
1434 * has restriction limitations on both ends of the pipe). 1449 * has restriction limitations on both ends of the pipe).
1435 * 1450 *
1436 * Currently we punt and implement it as a normal copy, see pipe_to_user(). 1451 * Currently we punt and implement it as a normal copy, see pipe_to_user().
1437 * 1452 *
1438 */ 1453 */
1439 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, 1454 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
1440 unsigned long, nr_segs, unsigned int, flags) 1455 unsigned long, nr_segs, unsigned int, flags)
1441 { 1456 {
1442 struct file *file; 1457 struct file *file;
1443 long error; 1458 long error;
1444 int fput; 1459 int fput;
1445 1460
1446 if (unlikely(nr_segs > UIO_MAXIOV)) 1461 if (unlikely(nr_segs > UIO_MAXIOV))
1447 return -EINVAL; 1462 return -EINVAL;
1448 else if (unlikely(!nr_segs)) 1463 else if (unlikely(!nr_segs))
1449 return 0; 1464 return 0;
1450 1465
1451 error = -EBADF; 1466 error = -EBADF;
1452 file = fget_light(fd, &fput); 1467 file = fget_light(fd, &fput);
1453 if (file) { 1468 if (file) {
1454 if (file->f_mode & FMODE_WRITE) 1469 if (file->f_mode & FMODE_WRITE)
1455 error = vmsplice_to_pipe(file, iov, nr_segs, flags); 1470 error = vmsplice_to_pipe(file, iov, nr_segs, flags);
1456 else if (file->f_mode & FMODE_READ) 1471 else if (file->f_mode & FMODE_READ)
1457 error = vmsplice_to_user(file, iov, nr_segs, flags); 1472 error = vmsplice_to_user(file, iov, nr_segs, flags);
1458 1473
1459 fput_light(file, fput); 1474 fput_light(file, fput);
1460 } 1475 }
1461 1476
1462 return error; 1477 return error;
1463 } 1478 }
1464 1479
1465 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, 1480 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1466 int, fd_out, loff_t __user *, off_out, 1481 int, fd_out, loff_t __user *, off_out,
1467 size_t, len, unsigned int, flags) 1482 size_t, len, unsigned int, flags)
1468 { 1483 {
1469 long error; 1484 long error;
1470 struct file *in, *out; 1485 struct file *in, *out;
1471 int fput_in, fput_out; 1486 int fput_in, fput_out;
1472 1487
1473 if (unlikely(!len)) 1488 if (unlikely(!len))
1474 return 0; 1489 return 0;
1475 1490
1476 error = -EBADF; 1491 error = -EBADF;
1477 in = fget_light(fd_in, &fput_in); 1492 in = fget_light(fd_in, &fput_in);
1478 if (in) { 1493 if (in) {
1479 if (in->f_mode & FMODE_READ) { 1494 if (in->f_mode & FMODE_READ) {
1480 out = fget_light(fd_out, &fput_out); 1495 out = fget_light(fd_out, &fput_out);
1481 if (out) { 1496 if (out) {
1482 if (out->f_mode & FMODE_WRITE) 1497 if (out->f_mode & FMODE_WRITE)
1483 error = do_splice(in, off_in, 1498 error = do_splice(in, off_in,
1484 out, off_out, 1499 out, off_out,
1485 len, flags); 1500 len, flags);
1486 fput_light(out, fput_out); 1501 fput_light(out, fput_out);
1487 } 1502 }
1488 } 1503 }
1489 1504
1490 fput_light(in, fput_in); 1505 fput_light(in, fput_in);
1491 } 1506 }
1492 1507
1493 return error; 1508 return error;
1494 } 1509 }
1495 1510
1496 /* 1511 /*
1497 * Make sure there's data to read. Wait for input if we can, otherwise 1512 * Make sure there's data to read. Wait for input if we can, otherwise
1498 * return an appropriate error. 1513 * return an appropriate error.
1499 */ 1514 */
1500 static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) 1515 static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1501 { 1516 {
1502 int ret; 1517 int ret;
1503 1518
1504 /* 1519 /*
1505 * Check ->nrbufs without the inode lock first. This function 1520 * Check ->nrbufs without the inode lock first. This function
1506 * is speculative anyways, so missing one is ok. 1521 * is speculative anyways, so missing one is ok.
1507 */ 1522 */
1508 if (pipe->nrbufs) 1523 if (pipe->nrbufs)
1509 return 0; 1524 return 0;
1510 1525
1511 ret = 0; 1526 ret = 0;
1512 mutex_lock(&pipe->inode->i_mutex); 1527 mutex_lock(&pipe->inode->i_mutex);
1513 1528
1514 while (!pipe->nrbufs) { 1529 while (!pipe->nrbufs) {
1515 if (signal_pending(current)) { 1530 if (signal_pending(current)) {
1516 ret = -ERESTARTSYS; 1531 ret = -ERESTARTSYS;
1517 break; 1532 break;
1518 } 1533 }
1519 if (!pipe->writers) 1534 if (!pipe->writers)
1520 break; 1535 break;
1521 if (!pipe->waiting_writers) { 1536 if (!pipe->waiting_writers) {
1522 if (flags & SPLICE_F_NONBLOCK) { 1537 if (flags & SPLICE_F_NONBLOCK) {
1523 ret = -EAGAIN; 1538 ret = -EAGAIN;
1524 break; 1539 break;
1525 } 1540 }
1526 } 1541 }
1527 pipe_wait(pipe); 1542 pipe_wait(pipe);
1528 } 1543 }
1529 1544
1530 mutex_unlock(&pipe->inode->i_mutex); 1545 mutex_unlock(&pipe->inode->i_mutex);
1531 return ret; 1546 return ret;
1532 } 1547 }
1533 1548
1534 /* 1549 /*
1535 * Make sure there's writeable room. Wait for room if we can, otherwise 1550 * Make sure there's writeable room. Wait for room if we can, otherwise
1536 * return an appropriate error. 1551 * return an appropriate error.
1537 */ 1552 */
1538 static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) 1553 static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1539 { 1554 {
1540 int ret; 1555 int ret;
1541 1556
1542 /* 1557 /*
1543 * Check ->nrbufs without the inode lock first. This function 1558 * Check ->nrbufs without the inode lock first. This function
1544 * is speculative anyways, so missing one is ok. 1559 * is speculative anyways, so missing one is ok.
1545 */ 1560 */
1546 if (pipe->nrbufs < PIPE_BUFFERS) 1561 if (pipe->nrbufs < PIPE_BUFFERS)
1547 return 0; 1562 return 0;
1548 1563
1549 ret = 0; 1564 ret = 0;
1550 mutex_lock(&pipe->inode->i_mutex); 1565 mutex_lock(&pipe->inode->i_mutex);
1551 1566
1552 while (pipe->nrbufs >= PIPE_BUFFERS) { 1567 while (pipe->nrbufs >= PIPE_BUFFERS) {
1553 if (!pipe->readers) { 1568 if (!pipe->readers) {
1554 send_sig(SIGPIPE, current, 0); 1569 send_sig(SIGPIPE, current, 0);
1555 ret = -EPIPE; 1570 ret = -EPIPE;
1556 break; 1571 break;
1557 } 1572 }
1558 if (flags & SPLICE_F_NONBLOCK) { 1573 if (flags & SPLICE_F_NONBLOCK) {
1559 ret = -EAGAIN; 1574 ret = -EAGAIN;
1560 break; 1575 break;
1561 } 1576 }
1562 if (signal_pending(current)) { 1577 if (signal_pending(current)) {
1563 ret = -ERESTARTSYS; 1578 ret = -ERESTARTSYS;
1564 break; 1579 break;
1565 } 1580 }
1566 pipe->waiting_writers++; 1581 pipe->waiting_writers++;
1567 pipe_wait(pipe); 1582 pipe_wait(pipe);
1568 pipe->waiting_writers--; 1583 pipe->waiting_writers--;
1569 } 1584 }
1570 1585
1571 mutex_unlock(&pipe->inode->i_mutex); 1586 mutex_unlock(&pipe->inode->i_mutex);
1572 return ret; 1587 return ret;
1573 } 1588 }
1574 1589
1575 /* 1590 /*
1576 * Link contents of ipipe to opipe. 1591 * Link contents of ipipe to opipe.
1577 */ 1592 */
1578 static int link_pipe(struct pipe_inode_info *ipipe, 1593 static int link_pipe(struct pipe_inode_info *ipipe,
1579 struct pipe_inode_info *opipe, 1594 struct pipe_inode_info *opipe,
1580 size_t len, unsigned int flags) 1595 size_t len, unsigned int flags)
1581 { 1596 {
1582 struct pipe_buffer *ibuf, *obuf; 1597 struct pipe_buffer *ibuf, *obuf;
1583 int ret = 0, i = 0, nbuf; 1598 int ret = 0, i = 0, nbuf;
1584 1599
1585 /* 1600 /*
1586 * Potential ABBA deadlock, work around it by ordering lock 1601 * Potential ABBA deadlock, work around it by ordering lock
1587 * grabbing by inode address. Otherwise two different processes 1602 * grabbing by inode address. Otherwise two different processes
1588 * could deadlock (one doing tee from A -> B, the other from B -> A). 1603 * could deadlock (one doing tee from A -> B, the other from B -> A).
1589 */ 1604 */
1590 inode_double_lock(ipipe->inode, opipe->inode); 1605 inode_double_lock(ipipe->inode, opipe->inode);
1591 1606
1592 do { 1607 do {
1593 if (!opipe->readers) { 1608 if (!opipe->readers) {
1594 send_sig(SIGPIPE, current, 0); 1609 send_sig(SIGPIPE, current, 0);
1595 if (!ret) 1610 if (!ret)
1596 ret = -EPIPE; 1611 ret = -EPIPE;
1597 break; 1612 break;
1598 } 1613 }
1599 1614
1600 /* 1615 /*
1601 * If we have iterated all input buffers or ran out of 1616 * If we have iterated all input buffers or ran out of
1602 * output room, break. 1617 * output room, break.
1603 */ 1618 */
1604 if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) 1619 if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
1605 break; 1620 break;
1606 1621
1607 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); 1622 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
1608 nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); 1623 nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
1609 1624
1610 /* 1625 /*
1611 * Get a reference to this pipe buffer, 1626 * Get a reference to this pipe buffer,
1612 * so we can copy the contents over. 1627 * so we can copy the contents over.
1613 */ 1628 */
1614 ibuf->ops->get(ipipe, ibuf); 1629 ibuf->ops->get(ipipe, ibuf);
1615 1630
1616 obuf = opipe->bufs + nbuf; 1631 obuf = opipe->bufs + nbuf;
1617 *obuf = *ibuf; 1632 *obuf = *ibuf;
1618 1633
1619 /* 1634 /*
1620 * Don't inherit the gift flag, we need to 1635 * Don't inherit the gift flag, we need to
1621 * prevent multiple steals of this page. 1636 * prevent multiple steals of this page.
1622 */ 1637 */
1623 obuf->flags &= ~PIPE_BUF_FLAG_GIFT; 1638 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1624 1639
1625 if (obuf->len > len) 1640 if (obuf->len > len)
1626 obuf->len = len; 1641 obuf->len = len;
1627 1642
1628 opipe->nrbufs++; 1643 opipe->nrbufs++;
1629 ret += obuf->len; 1644 ret += obuf->len;
1630 len -= obuf->len; 1645 len -= obuf->len;
1631 i++; 1646 i++;
1632 } while (len); 1647 } while (len);
1633 1648
1634 /* 1649 /*
1635 * return EAGAIN if we have the potential of some data in the 1650 * return EAGAIN if we have the potential of some data in the
1636 * future, otherwise just return 0 1651 * future, otherwise just return 0
1637 */ 1652 */
1638 if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK)) 1653 if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
1639 ret = -EAGAIN; 1654 ret = -EAGAIN;
1640 1655
1641 inode_double_unlock(ipipe->inode, opipe->inode); 1656 inode_double_unlock(ipipe->inode, opipe->inode);
1642 1657
1643 /* 1658 /*
1644 * If we put data in the output pipe, wakeup any potential readers. 1659 * If we put data in the output pipe, wakeup any potential readers.
1645 */ 1660 */
1646 if (ret > 0) { 1661 if (ret > 0) {
1647 smp_mb(); 1662 smp_mb();
1648 if (waitqueue_active(&opipe->wait)) 1663 if (waitqueue_active(&opipe->wait))
1649 wake_up_interruptible(&opipe->wait); 1664 wake_up_interruptible(&opipe->wait);
1650 kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); 1665 kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
1651 } 1666 }
1652 1667
1653 return ret; 1668 return ret;
1654 } 1669 }
1655 1670
1656 /* 1671 /*
1657 * This is a tee(1) implementation that works on pipes. It doesn't copy 1672 * This is a tee(1) implementation that works on pipes. It doesn't copy
1658 * any data, it simply references the 'in' pages on the 'out' pipe. 1673 * any data, it simply references the 'in' pages on the 'out' pipe.
1659 * The 'flags' used are the SPLICE_F_* variants, currently the only 1674 * The 'flags' used are the SPLICE_F_* variants, currently the only
1660 * applicable one is SPLICE_F_NONBLOCK. 1675 * applicable one is SPLICE_F_NONBLOCK.
1661 */ 1676 */
1662 static long do_tee(struct file *in, struct file *out, size_t len, 1677 static long do_tee(struct file *in, struct file *out, size_t len,
1663 unsigned int flags) 1678 unsigned int flags)
1664 { 1679 {
1665 struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode); 1680 struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode);
1666 struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode); 1681 struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode);
1667 int ret = -EINVAL; 1682 int ret = -EINVAL;
1668 1683
1669 /* 1684 /*
1670 * Duplicate the contents of ipipe to opipe without actually 1685 * Duplicate the contents of ipipe to opipe without actually
1671 * copying the data. 1686 * copying the data.
1672 */ 1687 */
1673 if (ipipe && opipe && ipipe != opipe) { 1688 if (ipipe && opipe && ipipe != opipe) {
1674 /* 1689 /*
1675 * Keep going, unless we encounter an error. The ipipe/opipe 1690 * Keep going, unless we encounter an error. The ipipe/opipe
1676 * ordering doesn't really matter. 1691 * ordering doesn't really matter.
1677 */ 1692 */
1678 ret = link_ipipe_prep(ipipe, flags); 1693 ret = link_ipipe_prep(ipipe, flags);
1679 if (!ret) { 1694 if (!ret) {
1680 ret = link_opipe_prep(opipe, flags); 1695 ret = link_opipe_prep(opipe, flags);
1681 if (!ret) 1696 if (!ret)
1682 ret = link_pipe(ipipe, opipe, len, flags); 1697 ret = link_pipe(ipipe, opipe, len, flags);
1683 } 1698 }
1684 } 1699 }
1685 1700
1686 return ret; 1701 return ret;
1687 } 1702 }
1688 1703
1689 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) 1704 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1690 { 1705 {
1691 struct file *in; 1706 struct file *in;
1692 int error, fput_in; 1707 int error, fput_in;
1693 1708
1694 if (unlikely(!len)) 1709 if (unlikely(!len))
1695 return 0; 1710 return 0;
1696 1711
1697 error = -EBADF; 1712 error = -EBADF;
1698 in = fget_light(fdin, &fput_in); 1713 in = fget_light(fdin, &fput_in);
1699 if (in) { 1714 if (in) {
1700 if (in->f_mode & FMODE_READ) { 1715 if (in->f_mode & FMODE_READ) {
1701 int fput_out; 1716 int fput_out;
1702 struct file *out = fget_light(fdout, &fput_out); 1717 struct file *out = fget_light(fdout, &fput_out);
1703 1718
1704 if (out) { 1719 if (out) {
1705 if (out->f_mode & FMODE_WRITE) 1720 if (out->f_mode & FMODE_WRITE)
1706 error = do_tee(in, out, len, flags); 1721 error = do_tee(in, out, len, flags);
1707 fput_light(out, fput_out); 1722 fput_light(out, fput_out);
1708 } 1723 }
1709 } 1724 }
1710 fput_light(in, fput_in); 1725 fput_light(in, fput_in);
1711 } 1726 }
1712 1727
1713 return error; 1728 return error;
1714 } 1729 }
1715 1730