Eric Lee / smarc-ti-linux-kernel

1

/* -*- mode: c; c-basic-offset: 8; -*-

1

/* -*- mode: c; c-basic-offset: 8; -*-

2

* vim: noexpandtab sw=8 ts=8 sts=0:

2

* vim: noexpandtab sw=8 ts=8 sts=0:

3

*

3

*

4

* file.c

4

* file.c

5

*

5

*

6

* File open, close, extend, truncate

6

* File open, close, extend, truncate

7

*

7

*

8

9

*

9

*

10

* This program is free software; you can redistribute it and/or

10

* This program is free software; you can redistribute it and/or

11

* modify it under the terms of the GNU General Public

11

* modify it under the terms of the GNU General Public

12

* License as published by the Free Software Foundation; either

12

* License as published by the Free Software Foundation; either

13

* version 2 of the License, or (at your option) any later version.

13

* version 2 of the License, or (at your option) any later version.

14

*

14

*

15

* This program is distributed in the hope that it will be useful,

15

* This program is distributed in the hope that it will be useful,

16

* but WITHOUT ANY WARRANTY; without even the implied warranty of

16

* but WITHOUT ANY WARRANTY; without even the implied warranty of

17

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

17

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

18

* General Public License for more details.

18

* General Public License for more details.

19

*

19

*

20

* You should have received a copy of the GNU General Public

20

* You should have received a copy of the GNU General Public

21

* License along with this program; if not, write to the

21

* License along with this program; if not, write to the

22

* Free Software Foundation, Inc., 59 Temple Place - Suite 330,

22

* Free Software Foundation, Inc., 59 Temple Place - Suite 330,

23

* Boston, MA 021110-1307, USA.

23

* Boston, MA 021110-1307, USA.

24

*/

24

*/

25

26

#include <linux/capability.h>

26

#include <linux/capability.h>

27

#include <linux/fs.h>

27

#include <linux/fs.h>

28

#include <linux/types.h>

28

#include <linux/types.h>

29

#include <linux/slab.h>

29

#include <linux/slab.h>

30

#include <linux/highmem.h>

30

#include <linux/highmem.h>

31

#include <linux/pagemap.h>

31

#include <linux/pagemap.h>

32

#include <linux/uio.h>

32

#include <linux/uio.h>

33

34

#define MLOG_MASK_PREFIX ML_INODE

34

#define MLOG_MASK_PREFIX ML_INODE

35

#include <cluster/masklog.h>

35

#include <cluster/masklog.h>

36

37

#include "ocfs2.h"

37

#include "ocfs2.h"

38

39

#include "alloc.h"

39

#include "alloc.h"

40

#include "aops.h"

40

#include "aops.h"

41

#include "dir.h"

41

#include "dir.h"

42

#include "dlmglue.h"

42

#include "dlmglue.h"

43

#include "extent_map.h"

43

#include "extent_map.h"

44

#include "file.h"

44

#include "file.h"

45

#include "sysfile.h"

45

#include "sysfile.h"

46

#include "inode.h"

46

#include "inode.h"

47

#include "journal.h"

47

#include "journal.h"

48

#include "mmap.h"

48

#include "mmap.h"

49

#include "suballoc.h"

49

#include "suballoc.h"

50

#include "super.h"

50

#include "super.h"

51

52

#include "buffer_head_io.h"

52

#include "buffer_head_io.h"

53

54

static int ocfs2_sync_inode(struct inode *inode)

54

static int ocfs2_sync_inode(struct inode *inode)

55

{

55

{

56

filemap_fdatawrite(inode->i_mapping);

56

filemap_fdatawrite(inode->i_mapping);

57

return sync_mapping_buffers(inode->i_mapping);

57

return sync_mapping_buffers(inode->i_mapping);

58

}

58

}

59

60

static int ocfs2_file_open(struct inode *inode, struct file *file)

60

static int ocfs2_file_open(struct inode *inode, struct file *file)

61

{

61

{

62

int status;

62

int status;

63

int mode = file->f_flags;

63

int mode = file->f_flags;

64

struct ocfs2_inode_info *oi = OCFS2_I(inode);

64

struct ocfs2_inode_info *oi = OCFS2_I(inode);

65

66

mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,

66

mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,

67

file->f_dentry->d_name.len, file->f_dentry->d_name.name);

67

file->f_dentry->d_name.len, file->f_dentry->d_name.name);

68

69

spin_lock(&oi->ip_lock);

69

spin_lock(&oi->ip_lock);

70

71

/* Check that the inode hasn't been wiped from disk by another

71

/* Check that the inode hasn't been wiped from disk by another

72

* node. If it hasn't then we're safe as long as we hold the

72

* node. If it hasn't then we're safe as long as we hold the

73

* spin lock until our increment of open count. */

73

* spin lock until our increment of open count. */

74

if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {

74

if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {

75

spin_unlock(&oi->ip_lock);

75

spin_unlock(&oi->ip_lock);

76

77

status = -ENOENT;

77

status = -ENOENT;

78

goto leave;

78

goto leave;

79

}

79

}

80

81

if (mode & O_DIRECT)

81

if (mode & O_DIRECT)

82

oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;

82

oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;

83

84

oi->ip_open_count++;

84

oi->ip_open_count++;

85

spin_unlock(&oi->ip_lock);

85

spin_unlock(&oi->ip_lock);

86

status = 0;

86

status = 0;

87

leave:

87

leave:

88

mlog_exit(status);

88

mlog_exit(status);

89

return status;

89

return status;

90

}

90

}

91

92

static int ocfs2_file_release(struct inode *inode, struct file *file)

92

static int ocfs2_file_release(struct inode *inode, struct file *file)

93

{

93

{

94

struct ocfs2_inode_info *oi = OCFS2_I(inode);

94

struct ocfs2_inode_info *oi = OCFS2_I(inode);

95

96

mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,

96

mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,

97

file->f_dentry->d_name.len,

97

file->f_dentry->d_name.len,

98

file->f_dentry->d_name.name);

98

file->f_dentry->d_name.name);

99

100

spin_lock(&oi->ip_lock);

100

spin_lock(&oi->ip_lock);

101

if (!--oi->ip_open_count)

101

if (!--oi->ip_open_count)

102

oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;

102

oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;

103

spin_unlock(&oi->ip_lock);

103

spin_unlock(&oi->ip_lock);

104

105

mlog_exit(0);

105

mlog_exit(0);

106

107

return 0;

107

return 0;

108

}

108

}

109

110

static int ocfs2_sync_file(struct file *file,

110

static int ocfs2_sync_file(struct file *file,

111

struct dentry *dentry,

111

struct dentry *dentry,

112

int datasync)

112

int datasync)

113

{

113

{

114

int err = 0;

114

int err = 0;

115

journal_t *journal;

115

journal_t *journal;

116

struct inode *inode = dentry->d_inode;

116

struct inode *inode = dentry->d_inode;

117

struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

117

struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

118

119

mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,

119

mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,

120

dentry->d_name.len, dentry->d_name.name);

120

dentry->d_name.len, dentry->d_name.name);

121

122

err = ocfs2_sync_inode(dentry->d_inode);

122

err = ocfs2_sync_inode(dentry->d_inode);

123

if (err)

123

if (err)

124

goto bail;

124

goto bail;

125

126

journal = osb->journal->j_journal;

126

journal = osb->journal->j_journal;

127

err = journal_force_commit(journal);

127

err = journal_force_commit(journal);

128

129

bail:

129

bail:

130

mlog_exit(err);

130

mlog_exit(err);

131

132

return (err < 0) ? -EIO : 0;

132

return (err < 0) ? -EIO : 0;

133

}

133

}

134

135

int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,

135

int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,

136

struct inode *inode,

136

struct inode *inode,

137

struct buffer_head *fe_bh,

137

struct buffer_head *fe_bh,

138

u64 new_i_size)

138

u64 new_i_size)

139

{

139

{

140

int status;

140

int status;

141

142

mlog_entry_void();

142

mlog_entry_void();

143

i_size_write(inode, new_i_size);

143

i_size_write(inode, new_i_size);

144

inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);

144

inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);

145

inode->i_ctime = inode->i_mtime = CURRENT_TIME;

145

inode->i_ctime = inode->i_mtime = CURRENT_TIME;

146

147

status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);

147

status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);

148

if (status < 0) {

148

if (status < 0) {

149

mlog_errno(status);

149

mlog_errno(status);

150

goto bail;

150

goto bail;

151

}

151

}

152

153

bail:

153

bail:

154

mlog_exit(status);

154

mlog_exit(status);

155

return status;

155

return status;

156

}

156

}

157

158

static int ocfs2_simple_size_update(struct inode *inode,

158

static int ocfs2_simple_size_update(struct inode *inode,

159

struct buffer_head *di_bh,

159

struct buffer_head *di_bh,

160

u64 new_i_size)

160

u64 new_i_size)

161

{

161

{

162

int ret;

162

int ret;

163

struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

163

struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

164

struct ocfs2_journal_handle *handle = NULL;

164

struct ocfs2_journal_handle *handle = NULL;

165

166

handle = ocfs2_start_trans(osb, NULL,

166

handle = ocfs2_start_trans(osb, NULL,

167

OCFS2_INODE_UPDATE_CREDITS);

167

OCFS2_INODE_UPDATE_CREDITS);

168

if (handle == NULL) {

168

if (handle == NULL) {

169

ret = -ENOMEM;

169

ret = -ENOMEM;

170

mlog_errno(ret);

170

mlog_errno(ret);

171

goto out;

171

goto out;

172

}

172

}

173

174

ret = ocfs2_set_inode_size(handle, inode, di_bh,

174

ret = ocfs2_set_inode_size(handle, inode, di_bh,

175

new_i_size);

175

new_i_size);

176

if (ret < 0)

176

if (ret < 0)

177

mlog_errno(ret);

177

mlog_errno(ret);

178

179

ocfs2_commit_trans(handle);

179

ocfs2_commit_trans(handle);

180

out:

180

out:

181

return ret;

181

return ret;

182

}

182

}

183

184

static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,

184

static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,

185

struct inode *inode,

185

struct inode *inode,

186

struct buffer_head *fe_bh,

186

struct buffer_head *fe_bh,

187

u64 new_i_size)

187

u64 new_i_size)

188

{

188

{

189

int status;

189

int status;

190

struct ocfs2_journal_handle *handle;

190

struct ocfs2_journal_handle *handle;

191

192

mlog_entry_void();

192

mlog_entry_void();

193

194

/* TODO: This needs to actually orphan the inode in this

194

/* TODO: This needs to actually orphan the inode in this

195

* transaction. */

195

* transaction. */

196

197

handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);

197

handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);

198

if (IS_ERR(handle)) {

198

if (IS_ERR(handle)) {

199

status = PTR_ERR(handle);

199

status = PTR_ERR(handle);

200

mlog_errno(status);

200

mlog_errno(status);

201

goto out;

201

goto out;

202

}

202

}

203

204

status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);

204

status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);

205

if (status < 0)

205

if (status < 0)

206

mlog_errno(status);

206

mlog_errno(status);

207

208

ocfs2_commit_trans(handle);

208

ocfs2_commit_trans(handle);

209

out:

209

out:

210

mlog_exit(status);

210

mlog_exit(status);

211

return status;

211

return status;

212

}

212

}

213

214

static int ocfs2_truncate_file(struct inode *inode,

214

static int ocfs2_truncate_file(struct inode *inode,

215

struct buffer_head *di_bh,

215

struct buffer_head *di_bh,

216

u64 new_i_size)

216

u64 new_i_size)

217

{

217

{

218

int status = 0;

218

int status = 0;

219

struct ocfs2_dinode *fe = NULL;

219

struct ocfs2_dinode *fe = NULL;

220

struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

220

struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

221

struct ocfs2_truncate_context *tc = NULL;

221

struct ocfs2_truncate_context *tc = NULL;

222

223

mlog_entry("(inode = %llu, new_i_size = %llu\n",

223

mlog_entry("(inode = %llu, new_i_size = %llu\n",

224

(unsigned long long)OCFS2_I(inode)->ip_blkno,

224

(unsigned long long)OCFS2_I(inode)->ip_blkno,

225

(unsigned long long)new_i_size);

225

(unsigned long long)new_i_size);

226

227

truncate_inode_pages(inode->i_mapping, new_i_size);

227

truncate_inode_pages(inode->i_mapping, new_i_size);

228

229

fe = (struct ocfs2_dinode *) di_bh->b_data;

229

fe = (struct ocfs2_dinode *) di_bh->b_data;

230

if (!OCFS2_IS_VALID_DINODE(fe)) {

230

if (!OCFS2_IS_VALID_DINODE(fe)) {

231

OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);

231

OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);

232

status = -EIO;

232

status = -EIO;

233

goto bail;

233

goto bail;

234

}

234

}

235

236

mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),

236

mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),

237

"Inode %llu, inode i_size = %lld != di "

237

"Inode %llu, inode i_size = %lld != di "

238

"i_size = %llu, i_flags = 0x%x\n",

238

"i_size = %llu, i_flags = 0x%x\n",

239

(unsigned long long)OCFS2_I(inode)->ip_blkno,

239

(unsigned long long)OCFS2_I(inode)->ip_blkno,

240

i_size_read(inode),

240

i_size_read(inode),

241

(unsigned long long)le64_to_cpu(fe->i_size),

241

(unsigned long long)le64_to_cpu(fe->i_size),

242

le32_to_cpu(fe->i_flags));

242

le32_to_cpu(fe->i_flags));

243

244

if (new_i_size > le64_to_cpu(fe->i_size)) {

244

if (new_i_size > le64_to_cpu(fe->i_size)) {

245

mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n",

245

mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n",

246

(unsigned long long)le64_to_cpu(fe->i_size),

246

(unsigned long long)le64_to_cpu(fe->i_size),

247

(unsigned long long)new_i_size);

247

(unsigned long long)new_i_size);

248

status = -EINVAL;

248

status = -EINVAL;

249

mlog_errno(status);

249

mlog_errno(status);

250

goto bail;

250

goto bail;

251

}

251

}

252

253

mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",

253

mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",

254

(unsigned long long)le64_to_cpu(fe->i_blkno),

254

(unsigned long long)le64_to_cpu(fe->i_blkno),

255

(unsigned long long)le64_to_cpu(fe->i_size),

255

(unsigned long long)le64_to_cpu(fe->i_size),

256

(unsigned long long)new_i_size);

256

(unsigned long long)new_i_size);

257

258

/* lets handle the simple truncate cases before doing any more

258

/* lets handle the simple truncate cases before doing any more

259

* cluster locking. */

259

* cluster locking. */

260

if (new_i_size == le64_to_cpu(fe->i_size))

260

if (new_i_size == le64_to_cpu(fe->i_size))

261

goto bail;

261

goto bail;

262

263

/* This forces other nodes to sync and drop their pages. Do

263

/* This forces other nodes to sync and drop their pages. Do

264

* this even if we have a truncate without allocation change -

264

* this even if we have a truncate without allocation change -

265

* ocfs2 cluster sizes can be much greater than page size, so

265

* ocfs2 cluster sizes can be much greater than page size, so

266

* we have to truncate them anyway. */

266

* we have to truncate them anyway. */

267

status = ocfs2_data_lock(inode, 1);

267

status = ocfs2_data_lock(inode, 1);

268

if (status < 0) {

268

if (status < 0) {

269

mlog_errno(status);

269

mlog_errno(status);

270

goto bail;

270

goto bail;

271

}

271

}

272

ocfs2_data_unlock(inode, 1);

272

ocfs2_data_unlock(inode, 1);

273

274

if (le32_to_cpu(fe->i_clusters) ==

274

if (le32_to_cpu(fe->i_clusters) ==

275

ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {

275

ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {

276

mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",

276

mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",

277

fe->i_clusters);

277

fe->i_clusters);

278

/* No allocation change is required, so lets fast path

278

/* No allocation change is required, so lets fast path

279

* this truncate. */

279

* this truncate. */

280

status = ocfs2_simple_size_update(inode, di_bh, new_i_size);

280

status = ocfs2_simple_size_update(inode, di_bh, new_i_size);

281

if (status < 0)

281

if (status < 0)

282

mlog_errno(status);

282

mlog_errno(status);

283

goto bail;

283

goto bail;

284

}

284

}

285

286

/* alright, we're going to need to do a full blown alloc size

286

/* alright, we're going to need to do a full blown alloc size

287

* change. Orphan the inode so that recovery can complete the

287

* change. Orphan the inode so that recovery can complete the

288

* truncate if necessary. This does the task of marking

288

* truncate if necessary. This does the task of marking

289

* i_size. */

289

* i_size. */

290

status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);

290

status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);

291

if (status < 0) {

291

if (status < 0) {

292

mlog_errno(status);

292

mlog_errno(status);

293

goto bail;

293

goto bail;

294

}

294

}

295

296

status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);

296

status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);

297

if (status < 0) {

297

if (status < 0) {

298

mlog_errno(status);

298

mlog_errno(status);

299

goto bail;

299

goto bail;

300

}

300

}

301

302

status = ocfs2_commit_truncate(osb, inode, di_bh, tc);

302

status = ocfs2_commit_truncate(osb, inode, di_bh, tc);

303

if (status < 0) {

303

if (status < 0) {

304

mlog_errno(status);

304

mlog_errno(status);

305

goto bail;

305

goto bail;

306

}

306

}

307

308

/* TODO: orphan dir cleanup here. */

308

/* TODO: orphan dir cleanup here. */

309

bail:

309

bail:

310

311

mlog_exit(status);

311

mlog_exit(status);

312

return status;

312

return status;

313

}

313

}

314

315

/*

315

/*

316

* extend allocation only here.

316

* extend allocation only here.

317

* we'll update all the disk stuff, and oip->alloc_size

317

* we'll update all the disk stuff, and oip->alloc_size

318

*

318

*

319

* expect stuff to be locked, a transaction started and enough data /

319

* expect stuff to be locked, a transaction started and enough data /

320

* metadata reservations in the contexts.

320

* metadata reservations in the contexts.

321

*

321

*

322

* Will return -EAGAIN, and a reason if a restart is needed.

322

* Will return -EAGAIN, and a reason if a restart is needed.

323

* If passed in, *reason will always be set, even in error.

323

* If passed in, *reason will always be set, even in error.

324

*/

324

*/

325

int ocfs2_do_extend_allocation(struct ocfs2_super *osb,

325

int ocfs2_do_extend_allocation(struct ocfs2_super *osb,

326

struct inode *inode,

326

struct inode *inode,

327

u32 clusters_to_add,

327

u32 clusters_to_add,

328

struct buffer_head *fe_bh,

328

struct buffer_head *fe_bh,

329

struct ocfs2_journal_handle *handle,

329

struct ocfs2_journal_handle *handle,

330

struct ocfs2_alloc_context *data_ac,

330

struct ocfs2_alloc_context *data_ac,

331

struct ocfs2_alloc_context *meta_ac,

331

struct ocfs2_alloc_context *meta_ac,

332

enum ocfs2_alloc_restarted *reason_ret)

332

enum ocfs2_alloc_restarted *reason_ret)

333

{

333

{

334

int status = 0;

334

int status = 0;

335

int free_extents;

335

int free_extents;

336

struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;

336

struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;

337

enum ocfs2_alloc_restarted reason = RESTART_NONE;

337

enum ocfs2_alloc_restarted reason = RESTART_NONE;

338

u32 bit_off, num_bits;

338

u32 bit_off, num_bits;

339

u64 block;

339

u64 block;

340

341

BUG_ON(!clusters_to_add);

341

BUG_ON(!clusters_to_add);

342

343

free_extents = ocfs2_num_free_extents(osb, inode, fe);

343

free_extents = ocfs2_num_free_extents(osb, inode, fe);

344

if (free_extents < 0) {

344

if (free_extents < 0) {

345

status = free_extents;

345

status = free_extents;

346

mlog_errno(status);

346

mlog_errno(status);

347

goto leave;

347

goto leave;

348

}

348

}

349

350

/* there are two cases which could cause us to EAGAIN in the

350

/* there are two cases which could cause us to EAGAIN in the

351

* we-need-more-metadata case:

351

* we-need-more-metadata case:

352

* 1) we haven't reserved *any*

352

* 1) we haven't reserved *any*

353

* 2) we are so fragmented, we've needed to add metadata too

353

* 2) we are so fragmented, we've needed to add metadata too

354

* many times. */

354

* many times. */

355

if (!free_extents && !meta_ac) {

355

if (!free_extents && !meta_ac) {

356

mlog(0, "we haven't reserved any metadata!\n");

356

mlog(0, "we haven't reserved any metadata!\n");

357

status = -EAGAIN;

357

status = -EAGAIN;

358

reason = RESTART_META;

358

reason = RESTART_META;

359

goto leave;

359

goto leave;

360

} else if ((!free_extents)

360

} else if ((!free_extents)

361

&& (ocfs2_alloc_context_bits_left(meta_ac)

361

&& (ocfs2_alloc_context_bits_left(meta_ac)

362

< ocfs2_extend_meta_needed(fe))) {

362

< ocfs2_extend_meta_needed(fe))) {

363

mlog(0, "filesystem is really fragmented...\n");

363

mlog(0, "filesystem is really fragmented...\n");

364

status = -EAGAIN;

364

status = -EAGAIN;

365

reason = RESTART_META;

365

reason = RESTART_META;

366

goto leave;

366

goto leave;

367

}

367

}

368

369

status = ocfs2_claim_clusters(osb, handle, data_ac, 1,

369

status = ocfs2_claim_clusters(osb, handle, data_ac, 1,

370

&bit_off, &num_bits);

370

&bit_off, &num_bits);

371

if (status < 0) {

371

if (status < 0) {

372

if (status != -ENOSPC)

372

if (status != -ENOSPC)

373

mlog_errno(status);

373

mlog_errno(status);

374

goto leave;

374

goto leave;

375

}

375

}

376

377

BUG_ON(num_bits > clusters_to_add);

377

BUG_ON(num_bits > clusters_to_add);

378

379

/* reserve our write early -- insert_extent may update the inode */

379

/* reserve our write early -- insert_extent may update the inode */

380

status = ocfs2_journal_access(handle, inode, fe_bh,

380

status = ocfs2_journal_access(handle, inode, fe_bh,

381

OCFS2_JOURNAL_ACCESS_WRITE);

381

OCFS2_JOURNAL_ACCESS_WRITE);

382

if (status < 0) {

382

if (status < 0) {

383

mlog_errno(status);

383

mlog_errno(status);

384

goto leave;

384

goto leave;

385

}

385

}

386

387

block = ocfs2_clusters_to_blocks(osb->sb, bit_off);

387

block = ocfs2_clusters_to_blocks(osb->sb, bit_off);

388

mlog(0, "Allocating %u clusters at block %u for inode %llu\n",

388

mlog(0, "Allocating %u clusters at block %u for inode %llu\n",

389

num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);

389

num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);

390

status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block,

390

status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block,

391

num_bits, meta_ac);

391

num_bits, meta_ac);

392

if (status < 0) {

392

if (status < 0) {

393

mlog_errno(status);

393

mlog_errno(status);

394

goto leave;

394

goto leave;

395

}

395

}

396

397

le32_add_cpu(&fe->i_clusters, num_bits);

397

le32_add_cpu(&fe->i_clusters, num_bits);

398

spin_lock(&OCFS2_I(inode)->ip_lock);

398

spin_lock(&OCFS2_I(inode)->ip_lock);

399

OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);

399

OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);

400

spin_unlock(&OCFS2_I(inode)->ip_lock);

400

spin_unlock(&OCFS2_I(inode)->ip_lock);

401

402

status = ocfs2_journal_dirty(handle, fe_bh);

402

status = ocfs2_journal_dirty(handle, fe_bh);

403

if (status < 0) {

403

if (status < 0) {

404

mlog_errno(status);

404

mlog_errno(status);

405

goto leave;

405

goto leave;

406

}

406

}

407

408

clusters_to_add -= num_bits;

408

clusters_to_add -= num_bits;

409

410

if (clusters_to_add) {

410

if (clusters_to_add) {

411

mlog(0, "need to alloc once more, clusters = %u, wanted = "

411

mlog(0, "need to alloc once more, clusters = %u, wanted = "

412

"%u\n", fe->i_clusters, clusters_to_add);

412

"%u\n", fe->i_clusters, clusters_to_add);

413

status = -EAGAIN;

413

status = -EAGAIN;

414

reason = RESTART_TRANS;

414

reason = RESTART_TRANS;

415

}

415

}

416

417

leave:

417

leave:

418

mlog_exit(status);

418

mlog_exit(status);

419

if (reason_ret)

419

if (reason_ret)

420

*reason_ret = reason;

420

*reason_ret = reason;

421

return status;

421

return status;

422

}

422

}

423

424

static int ocfs2_extend_allocation(struct inode *inode,

424

static int ocfs2_extend_allocation(struct inode *inode,

425

u32 clusters_to_add)

425

u32 clusters_to_add)

426

{

426

{

427

int status = 0;

427

int status = 0;

428

int restart_func = 0;

428

int restart_func = 0;

429

int drop_alloc_sem = 0;

429

int drop_alloc_sem = 0;

430

int credits, num_free_extents;

430

int credits, num_free_extents;

431

u32 prev_clusters;

431

u32 prev_clusters;

432

struct buffer_head *bh = NULL;

432

struct buffer_head *bh = NULL;

433

struct ocfs2_dinode *fe = NULL;

433

struct ocfs2_dinode *fe = NULL;

434

struct ocfs2_journal_handle *handle = NULL;

434

struct ocfs2_journal_handle *handle = NULL;

435

struct ocfs2_alloc_context *data_ac = NULL;

435

struct ocfs2_alloc_context *data_ac = NULL;

436

struct ocfs2_alloc_context *meta_ac = NULL;

436

struct ocfs2_alloc_context *meta_ac = NULL;

437

enum ocfs2_alloc_restarted why;

437

enum ocfs2_alloc_restarted why;

438

struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

438

struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

439

440

mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);

440

mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);

441

442

status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,

442

status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,

443

OCFS2_BH_CACHED, inode);

443

OCFS2_BH_CACHED, inode);

444

if (status < 0) {

444

if (status < 0) {

445

mlog_errno(status);

445

mlog_errno(status);

446

goto leave;

446

goto leave;

447

}

447

}

448

449

fe = (struct ocfs2_dinode *) bh->b_data;

449

fe = (struct ocfs2_dinode *) bh->b_data;

450

if (!OCFS2_IS_VALID_DINODE(fe)) {

450

if (!OCFS2_IS_VALID_DINODE(fe)) {

451

OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);

451

OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);

452

status = -EIO;

452

status = -EIO;

453

goto leave;

453

goto leave;

454

}

454

}

455

456

restart_all:

456

restart_all:

457

BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);

457

BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);

458

459

mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, "

459

mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, "

460

"clusters_to_add = %u\n",

460

"clusters_to_add = %u\n",

461

(unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),

461

(unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),

462

fe->i_clusters, clusters_to_add);

462

fe->i_clusters, clusters_to_add);

463

464

handle = ocfs2_alloc_handle(osb);

464

handle = ocfs2_alloc_handle(osb);

465

if (handle == NULL) {

465

if (handle == NULL) {

466

status = -ENOMEM;

466

status = -ENOMEM;

467

mlog_errno(status);

467

mlog_errno(status);

468

goto leave;

468

goto leave;

469

}

469

}

470

471

num_free_extents = ocfs2_num_free_extents(osb,

471

num_free_extents = ocfs2_num_free_extents(osb,

472

inode,

472

inode,

473

fe);

473

fe);

474

if (num_free_extents < 0) {

474

if (num_free_extents < 0) {

475

status = num_free_extents;

475

status = num_free_extents;

476

mlog_errno(status);

476

mlog_errno(status);

477

goto leave;

477

goto leave;

478

}

478

}

479

480

if (!num_free_extents) {

480

if (!num_free_extents) {

481

status = ocfs2_reserve_new_metadata(osb,

481

status = ocfs2_reserve_new_metadata(osb,

482

handle,

482

handle,

483

fe,

483

fe,

484

&meta_ac);

484

&meta_ac);

485

if (status < 0) {

485

if (status < 0) {

486

if (status != -ENOSPC)

486

if (status != -ENOSPC)

487

mlog_errno(status);

487

mlog_errno(status);

488

goto leave;

488

goto leave;

489

}

489

}

490

}

490

}

491

492

status = ocfs2_reserve_clusters(osb,

492

status = ocfs2_reserve_clusters(osb,

493

handle,

493

handle,

494

clusters_to_add,

494

clusters_to_add,

495

&data_ac);

495

&data_ac);

496

if (status < 0) {

496

if (status < 0) {

497

if (status != -ENOSPC)

497

if (status != -ENOSPC)

498

mlog_errno(status);

498

mlog_errno(status);

499

goto leave;

499

goto leave;

500

}

500

}

501

502

/* blocks peope in read/write from reading our allocation

502

/* blocks peope in read/write from reading our allocation

503

* until we're done changing it. We depend on i_mutex to block

503

* until we're done changing it. We depend on i_mutex to block

504

* other extend/truncate calls while we're here. Ordering wrt

504

* other extend/truncate calls while we're here. Ordering wrt

505

* start_trans is important here -- always do it before! */

505

* start_trans is important here -- always do it before! */

506

down_write(&OCFS2_I(inode)->ip_alloc_sem);

506

down_write(&OCFS2_I(inode)->ip_alloc_sem);

507

drop_alloc_sem = 1;

507

drop_alloc_sem = 1;

508

509

credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);

509

credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);

510

handle = ocfs2_start_trans(osb, handle, credits);

510

handle = ocfs2_start_trans(osb, handle, credits);

511

if (IS_ERR(handle)) {

511

if (IS_ERR(handle)) {

512

status = PTR_ERR(handle);

512

status = PTR_ERR(handle);

513

handle = NULL;

513

handle = NULL;

514

mlog_errno(status);

514

mlog_errno(status);

515

goto leave;

515

goto leave;

516

}

516

}

517

518

restarted_transaction:

518

restarted_transaction:

519

/* reserve a write to the file entry early on - that we if we

519

/* reserve a write to the file entry early on - that we if we

520

* run out of credits in the allocation path, we can still

520

* run out of credits in the allocation path, we can still

521

* update i_size. */

521

* update i_size. */

522

status = ocfs2_journal_access(handle, inode, bh,

522

status = ocfs2_journal_access(handle, inode, bh,

523

OCFS2_JOURNAL_ACCESS_WRITE);

523

OCFS2_JOURNAL_ACCESS_WRITE);

524

if (status < 0) {

524

if (status < 0) {

525

mlog_errno(status);

525

mlog_errno(status);

526

goto leave;

526

goto leave;

527

}

527

}

528

529

prev_clusters = OCFS2_I(inode)->ip_clusters;

529

prev_clusters = OCFS2_I(inode)->ip_clusters;

530

531

status = ocfs2_do_extend_allocation(osb,

531

status = ocfs2_do_extend_allocation(osb,

532

inode,

532

inode,

533

clusters_to_add,

533

clusters_to_add,

534

bh,

534

bh,

535

handle,

535

handle,

536

data_ac,

536

data_ac,

537

meta_ac,

537

meta_ac,

538

&why);

538

&why);

539

if ((status < 0) && (status != -EAGAIN)) {

539

if ((status < 0) && (status != -EAGAIN)) {

540

if (status != -ENOSPC)

540

if (status != -ENOSPC)

541

mlog_errno(status);

541

mlog_errno(status);

542

goto leave;

542

goto leave;

543

}

543

}

544

545

status = ocfs2_journal_dirty(handle, bh);

545

status = ocfs2_journal_dirty(handle, bh);

546

if (status < 0) {

546

if (status < 0) {

547

mlog_errno(status);

547

mlog_errno(status);

548

goto leave;

548

goto leave;

549

}

549

}

550

551

spin_lock(&OCFS2_I(inode)->ip_lock);

551

spin_lock(&OCFS2_I(inode)->ip_lock);

552

clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);

552

clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);

553

spin_unlock(&OCFS2_I(inode)->ip_lock);

553

spin_unlock(&OCFS2_I(inode)->ip_lock);

554

555

if (why != RESTART_NONE && clusters_to_add) {

555

if (why != RESTART_NONE && clusters_to_add) {

556

if (why == RESTART_META) {

556

if (why == RESTART_META) {

557

mlog(0, "restarting function.\n");

557

mlog(0, "restarting function.\n");

558

restart_func = 1;

558

restart_func = 1;

559

} else {

559

} else {

560

BUG_ON(why != RESTART_TRANS);

560

BUG_ON(why != RESTART_TRANS);

561

562

mlog(0, "restarting transaction.\n");

562

mlog(0, "restarting transaction.\n");

563

/* TODO: This can be more intelligent. */

563

/* TODO: This can be more intelligent. */

564

credits = ocfs2_calc_extend_credits(osb->sb,

564

credits = ocfs2_calc_extend_credits(osb->sb,

565

fe,

565

fe,

566

clusters_to_add);

566

clusters_to_add);

567

status = ocfs2_extend_trans(handle, credits);

567

status = ocfs2_extend_trans(handle, credits);

568

if (status < 0) {

568

if (status < 0) {

569

/* handle still has to be committed at

569

/* handle still has to be committed at

570

* this point. */

570

* this point. */

571

status = -ENOMEM;

571

status = -ENOMEM;

572

mlog_errno(status);

572

mlog_errno(status);

573

goto leave;

573

goto leave;

574

}

574

}

575

goto restarted_transaction;

575

goto restarted_transaction;

576

}

576

}

577

}

577

}

578

579

mlog(0, "fe: i_clusters = %u, i_size=%llu\n",

579

mlog(0, "fe: i_clusters = %u, i_size=%llu\n",

580

fe->i_clusters, (unsigned long long)fe->i_size);

580

fe->i_clusters, (unsigned long long)fe->i_size);

581

mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",

581

mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",

582

OCFS2_I(inode)->ip_clusters, i_size_read(inode));

582

OCFS2_I(inode)->ip_clusters, i_size_read(inode));

583

584

leave:

584

leave:

585

if (drop_alloc_sem) {

585

if (drop_alloc_sem) {

586

up_write(&OCFS2_I(inode)->ip_alloc_sem);

586

up_write(&OCFS2_I(inode)->ip_alloc_sem);

587

drop_alloc_sem = 0;

587

drop_alloc_sem = 0;

588

}

588

}

589

if (handle) {

589

if (handle) {

590

ocfs2_commit_trans(handle);

590

ocfs2_commit_trans(handle);

591

handle = NULL;

591

handle = NULL;

592

}

592

}

593

if (data_ac) {

593

if (data_ac) {

594

ocfs2_free_alloc_context(data_ac);

594

ocfs2_free_alloc_context(data_ac);

595

data_ac = NULL;

595

data_ac = NULL;

596

}

596

}

597

if (meta_ac) {

597

if (meta_ac) {

598

ocfs2_free_alloc_context(meta_ac);

598

ocfs2_free_alloc_context(meta_ac);

599

meta_ac = NULL;

599

meta_ac = NULL;

600

}

600

}

601

if ((!status) && restart_func) {

601

if ((!status) && restart_func) {

602

restart_func = 0;

602

restart_func = 0;

603

goto restart_all;

603

goto restart_all;

604

}

604

}

605

if (bh) {

605

if (bh) {

606

brelse(bh);

606

brelse(bh);

607

bh = NULL;

607

bh = NULL;

608

}

608

}

609

610

mlog_exit(status);

610

mlog_exit(status);

611

return status;

611

return status;

612

}

612

}

613

614

/* Some parts of this taken from generic_cont_expand, which turned out

614

/* Some parts of this taken from generic_cont_expand, which turned out

615

* to be too fragile to do exactly what we need without us having to

615

* to be too fragile to do exactly what we need without us having to

616

* worry about recursive locking in ->commit_write(). */

616

* worry about recursive locking in ->prepare_write() and

617

* ->commit_write(). */

617

static int ocfs2_write_zero_page(struct inode *inode,

618

static int ocfs2_write_zero_page(struct inode *inode,

618

u64 size)

619

u64 size)

619

{

620

{

620

struct address_space *mapping = inode->i_mapping;

621

struct address_space *mapping = inode->i_mapping;

621

struct page *page;

622

struct page *page;

622

unsigned long index;

623

unsigned long index;

623

unsigned int offset;

624

unsigned int offset;

624

struct ocfs2_journal_handle *handle = NULL;

625

struct ocfs2_journal_handle *handle = NULL;

625

int ret;

626

int ret;

626

627

offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */

628

offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */

628

/* ugh. in prepare/commit_write, if from==to==start of block, we

629

/* ugh. in prepare/commit_write, if from==to==start of block, we

629

** skip the prepare. make sure we never send an offset for the start

630

** skip the prepare. make sure we never send an offset for the start

630

** of a block

631

** of a block

631

*/

632

*/

632

if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {

633

if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {

633

offset++;

634

offset++;

634

}

635

}

635

index = size >> PAGE_CACHE_SHIFT;

636

index = size >> PAGE_CACHE_SHIFT;

636

637

page = grab_cache_page(mapping, index);

638

page = grab_cache_page(mapping, index);

638

if (!page) {

639

if (!page) {

639

ret = -ENOMEM;

640

ret = -ENOMEM;

640

mlog_errno(ret);

641

mlog_errno(ret);

641

goto out;

642

goto out;

642

}

643

}

643

644

ret = ocfs2_prepare_write(NULL, page, offset, offset);

645

ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);

645

if (ret < 0) {

646

if (ret < 0) {

646

mlog_errno(ret);

647

mlog_errno(ret);

647

goto out_unlock;

648

goto out_unlock;

648

}

649

}

649

650

if (ocfs2_should_order_data(inode)) {

651

if (ocfs2_should_order_data(inode)) {

651

handle = ocfs2_start_walk_page_trans(inode, page, offset,

652

handle = ocfs2_start_walk_page_trans(inode, page, offset,

652

offset);

653

offset);

653

if (IS_ERR(handle)) {

654

if (IS_ERR(handle)) {

654

ret = PTR_ERR(handle);

655

ret = PTR_ERR(handle);

655

handle = NULL;

656

handle = NULL;

656

goto out_unlock;

657

goto out_unlock;

657

}

658

}

658

}

659

}

659

660

/* must not update i_size! */

661

/* must not update i_size! */

661

ret = block_commit_write(page, offset, offset);

662

ret = block_commit_write(page, offset, offset);

662

if (ret < 0)

663

if (ret < 0)

663

mlog_errno(ret);

664

mlog_errno(ret);

664

else

665

else

665

ret = 0;

666

ret = 0;

666

667

if (handle)

668

if (handle)

668

ocfs2_commit_trans(handle);

669

ocfs2_commit_trans(handle);

669

out_unlock:

670

out_unlock:

670

unlock_page(page);

671

unlock_page(page);

671

page_cache_release(page);

672

page_cache_release(page);

672

out:

673

out:

673

return ret;

674

return ret;

674

}

675

}

675

676

static int ocfs2_zero_extend(struct inode *inode,

677

static int ocfs2_zero_extend(struct inode *inode,

677

u64 zero_to_size)

678

u64 zero_to_size)

678

{

679

{

679

int ret = 0;

680

int ret = 0;

680

u64 start_off;

681

u64 start_off;

681

struct super_block *sb = inode->i_sb;

682

struct super_block *sb = inode->i_sb;

682

683

start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));

684

start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));

684

while (start_off < zero_to_size) {

685

while (start_off < zero_to_size) {

685

ret = ocfs2_write_zero_page(inode, start_off);

686

ret = ocfs2_write_zero_page(inode, start_off);

686

if (ret < 0) {

687

if (ret < 0) {

687

mlog_errno(ret);

688

mlog_errno(ret);

688

goto out;

689

goto out;

689

}

690

}

690

691

start_off += sb->s_blocksize;

692

start_off += sb->s_blocksize;

692

}

693

}

693

694

out:

695

out:

695

return ret;

696

return ret;

696

}

697

}

697

698

699

/*

700

* A tail_to_skip value > 0 indicates that we're being called from

701

* ocfs2_file_aio_write(). This has the following implications:

702

*

703

* - we don't want to update i_size

704

* - di_bh will be NULL, which is fine because it's only used in the

705

* case where we want to update i_size.

706

* - ocfs2_zero_extend() will then only be filling the hole created

707

* between i_size and the start of the write.

708

*/

698

static int ocfs2_extend_file(struct inode *inode,

709

static int ocfs2_extend_file(struct inode *inode,

699

struct buffer_head *di_bh,

710

struct buffer_head *di_bh,

700

u64 new_i_size)

711

u64 new_i_size,

712

size_t tail_to_skip)

701

{

713

{

702

int ret = 0;

714

int ret = 0;

703

u32 clusters_to_add;

715

u32 clusters_to_add;

704

716

717

BUG_ON(!tail_to_skip && !di_bh);

718

705

/* setattr sometimes calls us like this. */

719

/* setattr sometimes calls us like this. */

706

if (new_i_size == 0)

720

if (new_i_size == 0)

707

goto out;

721

goto out;

708

722

709

if (i_size_read(inode) == new_i_size)

723

if (i_size_read(inode) == new_i_size)

710

goto out;

724

goto out;

711

BUG_ON(new_i_size < i_size_read(inode));

725

BUG_ON(new_i_size < i_size_read(inode));

712

726

713

clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -

727

clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -

714

OCFS2_I(inode)->ip_clusters;

728

OCFS2_I(inode)->ip_clusters;

715

729

716

if (clusters_to_add) {

730

if (clusters_to_add) {

717

ret = ocfs2_extend_allocation(inode, clusters_to_add);

731

/*

732

* protect the pages that ocfs2_zero_extend is going to

733

* be pulling into the page cache.. we do this before the

734

* metadata extend so that we don't get into the situation

735

* where we've extended the metadata but can't get the data

736

* lock to zero.

737

*/

738

ret = ocfs2_data_lock(inode, 1);

718

if (ret < 0) {

739

if (ret < 0) {

719

mlog_errno(ret);

740

mlog_errno(ret);

720

goto out;

741

goto out;

721

}

742

}

722

743

723

ret = ocfs2_zero_extend(inode, new_i_size);

744

ret = ocfs2_extend_allocation(inode, clusters_to_add);

724

if (ret < 0) {

745

if (ret < 0) {

725

mlog_errno(ret);

746

mlog_errno(ret);

726

goto out;

747

goto out_unlock;

727

}

748

}

728

}

729

749

730

/* No allocation required, we just use this helper to

750

ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip);

731

* do a trivial update of i_size. */

751

if (ret < 0) {

732

ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);

752

mlog_errno(ret);

733

if (ret < 0) {

753

goto out_unlock;

734

mlog_errno(ret);

754

}

735

goto out;

736

}

755

}

737

756

757

if (!tail_to_skip) {

758

/* We're being called from ocfs2_setattr() which wants

759

* us to update i_size */

760

ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);

761

if (ret < 0)

762

mlog_errno(ret);

763

}

764

765

out_unlock:

766

if (clusters_to_add) /* this is the only case in which we lock */

767

ocfs2_data_unlock(inode, 1);

768

738

out:

769

out:

739

return ret;

770

return ret;

740

}

771

}

741

772

742

int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)

773

int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)

743

{

774

{

744

int status = 0, size_change;

775

int status = 0, size_change;

745

struct inode *inode = dentry->d_inode;

776

struct inode *inode = dentry->d_inode;

746

struct super_block *sb = inode->i_sb;

777

struct super_block *sb = inode->i_sb;

747

struct ocfs2_super *osb = OCFS2_SB(sb);

778

struct ocfs2_super *osb = OCFS2_SB(sb);

748

struct buffer_head *bh = NULL;

779

struct buffer_head *bh = NULL;

749

struct ocfs2_journal_handle *handle = NULL;

780

struct ocfs2_journal_handle *handle = NULL;

750

781

751

mlog_entry("(0x%p, '%.*s')\n", dentry,

782

mlog_entry("(0x%p, '%.*s')\n", dentry,

752

dentry->d_name.len, dentry->d_name.name);

783

dentry->d_name.len, dentry->d_name.name);

753

784

754

if (attr->ia_valid & ATTR_MODE)

785

if (attr->ia_valid & ATTR_MODE)

755

mlog(0, "mode change: %d\n", attr->ia_mode);

786

mlog(0, "mode change: %d\n", attr->ia_mode);

756

if (attr->ia_valid & ATTR_UID)

787

if (attr->ia_valid & ATTR_UID)

757

mlog(0, "uid change: %d\n", attr->ia_uid);

788

mlog(0, "uid change: %d\n", attr->ia_uid);

758

if (attr->ia_valid & ATTR_GID)

789

if (attr->ia_valid & ATTR_GID)

759

mlog(0, "gid change: %d\n", attr->ia_gid);

790

mlog(0, "gid change: %d\n", attr->ia_gid);

760

if (attr->ia_valid & ATTR_SIZE)

791

if (attr->ia_valid & ATTR_SIZE)

761

mlog(0, "size change...\n");

792

mlog(0, "size change...\n");

762

if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))

793

if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))

763

mlog(0, "time change...\n");

794

mlog(0, "time change...\n");

764

795

765

#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \

796

#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \

766

| ATTR_GID | ATTR_UID | ATTR_MODE)

797

| ATTR_GID | ATTR_UID | ATTR_MODE)

767

if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {

798

if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {

768

mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);

799

mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);

769

return 0;

800

return 0;

770

}

801

}

771

802

772

status = inode_change_ok(inode, attr);

803

status = inode_change_ok(inode, attr);

773

if (status)

804

if (status)

774

return status;

805

return status;

775

806

776

size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;

807

size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;

777

if (size_change) {

808

if (size_change) {

778

status = ocfs2_rw_lock(inode, 1);

809

status = ocfs2_rw_lock(inode, 1);

779

if (status < 0) {

810

if (status < 0) {

780

mlog_errno(status);

811

mlog_errno(status);

781

goto bail;

812

goto bail;

782

}

813

}

783

}

814

}

784

815

785

status = ocfs2_meta_lock(inode, NULL, &bh, 1);

816

status = ocfs2_meta_lock(inode, NULL, &bh, 1);

786

if (status < 0) {

817

if (status < 0) {

787

if (status != -ENOENT)

818

if (status != -ENOENT)

788

mlog_errno(status);

819

mlog_errno(status);

789

goto bail_unlock_rw;

820

goto bail_unlock_rw;

790

}

821

}

791

822

792

if (size_change && attr->ia_size != i_size_read(inode)) {

823

if (size_change && attr->ia_size != i_size_read(inode)) {

793

if (i_size_read(inode) > attr->ia_size)

824

if (i_size_read(inode) > attr->ia_size)

794

status = ocfs2_truncate_file(inode, bh, attr->ia_size);

825

status = ocfs2_truncate_file(inode, bh, attr->ia_size);

795

else

826

else

796

status = ocfs2_extend_file(inode, bh, attr->ia_size);

827

status = ocfs2_extend_file(inode, bh, attr->ia_size, 0);

797

if (status < 0) {

828

if (status < 0) {

798

if (status != -ENOSPC)

829

if (status != -ENOSPC)

799

mlog_errno(status);

830

mlog_errno(status);

800

status = -ENOSPC;

831

status = -ENOSPC;

801

goto bail_unlock;

832

goto bail_unlock;

802

}

833

}

803

}

834

}

804

835

805

handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);

836

handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);

806

if (IS_ERR(handle)) {

837

if (IS_ERR(handle)) {

807

status = PTR_ERR(handle);

838

status = PTR_ERR(handle);

808

mlog_errno(status);

839

mlog_errno(status);

809

goto bail_unlock;

840

goto bail_unlock;

810

}

841

}

811

842

812

status = inode_setattr(inode, attr);

843

status = inode_setattr(inode, attr);

813

if (status < 0) {

844

if (status < 0) {

814

mlog_errno(status);

845

mlog_errno(status);

815

goto bail_commit;

846

goto bail_commit;

816

}

847

}

817

848

818

status = ocfs2_mark_inode_dirty(handle, inode, bh);

849

status = ocfs2_mark_inode_dirty(handle, inode, bh);

819

if (status < 0)

850

if (status < 0)

820

mlog_errno(status);

851

mlog_errno(status);

821

852

822

bail_commit:

853

bail_commit:

823

ocfs2_commit_trans(handle);

854

ocfs2_commit_trans(handle);

824

bail_unlock:

855

bail_unlock:

825

ocfs2_meta_unlock(inode, 1);

856

ocfs2_meta_unlock(inode, 1);

826

bail_unlock_rw:

857

bail_unlock_rw:

827

if (size_change)

858

if (size_change)

828

ocfs2_rw_unlock(inode, 1);

859

ocfs2_rw_unlock(inode, 1);

829

bail:

860

bail:

830

if (bh)

861

if (bh)

831

brelse(bh);

862

brelse(bh);

832

863

833

mlog_exit(status);

864

mlog_exit(status);

834

return status;

865

return status;

835

}

866

}

836

867

837

int ocfs2_getattr(struct vfsmount *mnt,

868

int ocfs2_getattr(struct vfsmount *mnt,

838

struct dentry *dentry,

869

struct dentry *dentry,

839

struct kstat *stat)

870

struct kstat *stat)

840

{

871

{

841

struct inode *inode = dentry->d_inode;

872

struct inode *inode = dentry->d_inode;

842

struct super_block *sb = dentry->d_inode->i_sb;

873

struct super_block *sb = dentry->d_inode->i_sb;

843

struct ocfs2_super *osb = sb->s_fs_info;

874

struct ocfs2_super *osb = sb->s_fs_info;

844

int err;

875

int err;

845

876

846

mlog_entry_void();

877

mlog_entry_void();

847

878

848

err = ocfs2_inode_revalidate(dentry);

879

err = ocfs2_inode_revalidate(dentry);

849

if (err) {

880

if (err) {

850

if (err != -ENOENT)

881

if (err != -ENOENT)

851

mlog_errno(err);

882

mlog_errno(err);

852

goto bail;

883

goto bail;

853

}

884

}

854

885

855

generic_fillattr(inode, stat);

886

generic_fillattr(inode, stat);

856

887

857

/* We set the blksize from the cluster size for performance */

888

/* We set the blksize from the cluster size for performance */

858

stat->blksize = osb->s_clustersize;

889

stat->blksize = osb->s_clustersize;

859

890

860

bail:

891

bail:

861

mlog_exit(err);

892

mlog_exit(err);

862

893

863

return err;

894

return err;

864

}

895

}

865

896

866

static int ocfs2_write_remove_suid(struct inode *inode)

897

static int ocfs2_write_remove_suid(struct inode *inode)

867

{

898

{

868

int ret;

899

int ret;

869

struct buffer_head *bh = NULL;

900

struct buffer_head *bh = NULL;

870

struct ocfs2_inode_info *oi = OCFS2_I(inode);

901

struct ocfs2_inode_info *oi = OCFS2_I(inode);

871

struct ocfs2_journal_handle *handle;

902

struct ocfs2_journal_handle *handle;

872

struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

903

struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

873

struct ocfs2_dinode *di;

904

struct ocfs2_dinode *di;

874

905

875

mlog_entry("(Inode %llu, mode 0%o)\n",

906

mlog_entry("(Inode %llu, mode 0%o)\n",

876

(unsigned long long)oi->ip_blkno, inode->i_mode);

907

(unsigned long long)oi->ip_blkno, inode->i_mode);

877

908

878

handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);

909

handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);

879

if (handle == NULL) {

910

if (handle == NULL) {

880

ret = -ENOMEM;

911

ret = -ENOMEM;

881

mlog_errno(ret);

912

mlog_errno(ret);

882

goto out;

913

goto out;

883

}

914

}

884

915

885

ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);

916

ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);

886

if (ret < 0) {

917

if (ret < 0) {

887

mlog_errno(ret);

918

mlog_errno(ret);

888

goto out_trans;

919

goto out_trans;

889

}

920

}

890

921

891

ret = ocfs2_journal_access(handle, inode, bh,

922

ret = ocfs2_journal_access(handle, inode, bh,

892

OCFS2_JOURNAL_ACCESS_WRITE);

923

OCFS2_JOURNAL_ACCESS_WRITE);

893

if (ret < 0) {

924

if (ret < 0) {

894

mlog_errno(ret);

925

mlog_errno(ret);

895

goto out_bh;

926

goto out_bh;

896

}

927

}

897

928

898

inode->i_mode &= ~S_ISUID;

929

inode->i_mode &= ~S_ISUID;

899

if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))

930

if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))

900

inode->i_mode &= ~S_ISGID;

931

inode->i_mode &= ~S_ISGID;

901

932

902

di = (struct ocfs2_dinode *) bh->b_data;

933

di = (struct ocfs2_dinode *) bh->b_data;

903

di->i_mode = cpu_to_le16(inode->i_mode);

934

di->i_mode = cpu_to_le16(inode->i_mode);

904

935

905

ret = ocfs2_journal_dirty(handle, bh);

936

ret = ocfs2_journal_dirty(handle, bh);

906

if (ret < 0)

937

if (ret < 0)

907

mlog_errno(ret);

938

mlog_errno(ret);

908

out_bh:

939

out_bh:

909

brelse(bh);

940

brelse(bh);

910

out_trans:

941

out_trans:

911

ocfs2_commit_trans(handle);

942

ocfs2_commit_trans(handle);

912

out:

943

out:

913

mlog_exit(ret);

944

mlog_exit(ret);

914

return ret;

945

return ret;

915

}

946

}

916

947

917

static inline int ocfs2_write_should_remove_suid(struct inode *inode)

948

static inline int ocfs2_write_should_remove_suid(struct inode *inode)

918

{

949

{

919

mode_t mode = inode->i_mode;

950

mode_t mode = inode->i_mode;

920

951

921

if (!capable(CAP_FSETID)) {

952

if (!capable(CAP_FSETID)) {

922

if (unlikely(mode & S_ISUID))

953

if (unlikely(mode & S_ISUID))

923

return 1;

954

return 1;

924

955

925

if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))

956

if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))

926

return 1;

957

return 1;

927

}

958

}

928

return 0;

959

return 0;

929

}

960

}

930

961

931

static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,

962

static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,

932

const char __user *buf,

963

const char __user *buf,

933

size_t count,

964

size_t count,

934

loff_t pos)

965

loff_t pos)

935

{

966

{

936

struct iovec local_iov = { .iov_base = (void __user *)buf,

967

struct iovec local_iov = { .iov_base = (void __user *)buf,

937

.iov_len = count };

968

.iov_len = count };

938

int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0;

969

int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0;

939

u32 clusters;

970

u32 clusters;

940

struct file *filp = iocb->ki_filp;

971

struct file *filp = iocb->ki_filp;

941

struct inode *inode = filp->f_dentry->d_inode;

972

struct inode *inode = filp->f_dentry->d_inode;

942

loff_t newsize, saved_pos;

973

loff_t newsize, saved_pos;

943

974

944

mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,

975

mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,

945

(unsigned int)count,

976

(unsigned int)count,

946

filp->f_dentry->d_name.len,

977

filp->f_dentry->d_name.len,

947

filp->f_dentry->d_name.name);

978

filp->f_dentry->d_name.name);

948

979

949

/* happy write of zero bytes */

980

/* happy write of zero bytes */

950

if (count == 0)

981

if (count == 0)

951

return 0;

982

return 0;

952

983

953

if (!inode) {

984

if (!inode) {

954

mlog(0, "bad inode\n");

985

mlog(0, "bad inode\n");

955

return -EIO;

986

return -EIO;

956

}

987

}

957

988

958

mutex_lock(&inode->i_mutex);

989

mutex_lock(&inode->i_mutex);

959

/* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */

990

/* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */

960

if (filp->f_flags & O_DIRECT) {

991

if (filp->f_flags & O_DIRECT) {

961

have_alloc_sem = 1;

992

have_alloc_sem = 1;

962

down_read(&inode->i_alloc_sem);

993

down_read(&inode->i_alloc_sem);

963

}

994

}

964

995

965

/* concurrent O_DIRECT writes are allowed */

996

/* concurrent O_DIRECT writes are allowed */

966

rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1;

997

rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1;

967

ret = ocfs2_rw_lock(inode, rw_level);

998

ret = ocfs2_rw_lock(inode, rw_level);

968

if (ret < 0) {

999

if (ret < 0) {

969

rw_level = -1;

1000

rw_level = -1;

970

mlog_errno(ret);

1001

mlog_errno(ret);

971

goto out;

1002

goto out;

972

}

1003

}

973

1004

974

/*

1005

/*

975

* We sample i_size under a read level meta lock to see if our write

1006

* We sample i_size under a read level meta lock to see if our write

976

* is extending the file, if it is we back off and get a write level

1007

* is extending the file, if it is we back off and get a write level

977

* meta lock.

1008

* meta lock.

978

*/

1009

*/

979

meta_level = (filp->f_flags & O_APPEND) ? 1 : 0;

1010

meta_level = (filp->f_flags & O_APPEND) ? 1 : 0;

980

for(;;) {

1011

for(;;) {

981

ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level);

1012

ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level);

982

if (ret < 0) {

1013

if (ret < 0) {

983

meta_level = -1;

1014

meta_level = -1;

984

mlog_errno(ret);

1015

mlog_errno(ret);

985

goto out;

1016

goto out;

986

}

1017

}

987

1018

988

/* Clear suid / sgid if necessary. We do this here

1019

/* Clear suid / sgid if necessary. We do this here

989

* instead of later in the write path because

1020

* instead of later in the write path because

990

* remove_suid() calls ->setattr without any hint that

1021

* remove_suid() calls ->setattr without any hint that

991

* we may have already done our cluster locking. Since

1022

* we may have already done our cluster locking. Since

992

* ocfs2_setattr() *must* take cluster locks to

1023

* ocfs2_setattr() *must* take cluster locks to

993

* proceeed, this will lead us to recursively lock the

1024

* proceeed, this will lead us to recursively lock the

994

* inode. There's also the dinode i_size state which

1025

* inode. There's also the dinode i_size state which

995

* can be lost via setattr during extending writes (we

1026

* can be lost via setattr during extending writes (we

996

* set inode->i_size at the end of a write. */

1027

* set inode->i_size at the end of a write. */

997

if (ocfs2_write_should_remove_suid(inode)) {

1028

if (ocfs2_write_should_remove_suid(inode)) {

998

if (meta_level == 0) {

1029

if (meta_level == 0) {

999

ocfs2_meta_unlock(inode, meta_level);

1030

ocfs2_meta_unlock(inode, meta_level);

1000

meta_level = 1;

1031

meta_level = 1;

1001

continue;

1032

continue;

1002

}

1033

}

1003

1034

1004

ret = ocfs2_write_remove_suid(inode);

1035

ret = ocfs2_write_remove_suid(inode);

1005

if (ret < 0) {

1036

if (ret < 0) {

1006

mlog_errno(ret);

1037

mlog_errno(ret);

1007

goto out;

1038

goto out;

1008

}

1039

}

1009

}

1040

}

1010

1041

1011

/* work on a copy of ppos until we're sure that we won't have

1042

/* work on a copy of ppos until we're sure that we won't have

1012

* to recalculate it due to relocking. */

1043

* to recalculate it due to relocking. */

1013

if (filp->f_flags & O_APPEND) {

1044

if (filp->f_flags & O_APPEND) {

1014

saved_pos = i_size_read(inode);

1045

saved_pos = i_size_read(inode);

1015

mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);

1046

mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);

1016

} else {

1047

} else {

1017

saved_pos = iocb->ki_pos;

1048

saved_pos = iocb->ki_pos;

1018

}

1049

}

1019

newsize = count + saved_pos;

1050

newsize = count + saved_pos;

1020

1051

1021

mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",

1052

mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",

1022

(long long) saved_pos, (long long) newsize,

1053

(long long) saved_pos, (long long) newsize,

1023

(long long) i_size_read(inode));

1054

(long long) i_size_read(inode));

1024

1055

1025

/* No need for a higher level metadata lock if we're

1056

/* No need for a higher level metadata lock if we're

1026

* never going past i_size. */

1057

* never going past i_size. */

1027

if (newsize <= i_size_read(inode))

1058

if (newsize <= i_size_read(inode))

1028

break;

1059

break;

1029

1060

1030

if (meta_level == 0) {

1061

if (meta_level == 0) {

1031

ocfs2_meta_unlock(inode, meta_level);

1062

ocfs2_meta_unlock(inode, meta_level);

1032

meta_level = 1;

1063

meta_level = 1;

1033

continue;

1064

continue;

1034

}

1065

}

1035

1066

1036

spin_lock(&OCFS2_I(inode)->ip_lock);

1067

spin_lock(&OCFS2_I(inode)->ip_lock);

1037

clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) -

1068

clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) -

1038

OCFS2_I(inode)->ip_clusters;

1069

OCFS2_I(inode)->ip_clusters;

1039

spin_unlock(&OCFS2_I(inode)->ip_lock);

1070

spin_unlock(&OCFS2_I(inode)->ip_lock);

1040

1071

1041

mlog(0, "Writing at EOF, may need more allocation: "

1072

mlog(0, "Writing at EOF, may need more allocation: "

1042

"i_size = %lld, newsize = %lld, need %u clusters\n",

1073

"i_size = %lld, newsize = %lld, need %u clusters\n",

1043

(long long) i_size_read(inode), (long long) newsize,

1074

(long long) i_size_read(inode), (long long) newsize,

1044

clusters);

1075

clusters);

1045

1076

1046

/* We only want to continue the rest of this loop if

1077

/* We only want to continue the rest of this loop if

1047

* our extend will actually require more

1078

* our extend will actually require more

1048

* allocation. */

1079

* allocation. */

1049

if (!clusters)

1080

if (!clusters)

1050

break;

1081

break;

1051

1082

1052

ret = ocfs2_extend_allocation(inode, clusters);

1083

ret = ocfs2_extend_file(inode, NULL, newsize, count);

1053

if (ret < 0) {

1084

if (ret < 0) {

1054

if (ret != -ENOSPC)

1085

if (ret != -ENOSPC)

1055

mlog_errno(ret);

1086

mlog_errno(ret);

1056

goto out;

1057

}

1058

1059

/* Fill any holes which would've been created by this

1060

* write. If we're O_APPEND, this will wind up

1061

* (correctly) being a noop. */

1062

ret = ocfs2_zero_extend(inode, (u64) newsize - count);

1063

if (ret < 0) {

1064

mlog_errno(ret);

1065

goto out;

1087

goto out;

1066

}

1088

}

1067

break;

1089

break;

1068

}

1090

}

1069

1091

1070

/* ok, we're done with i_size and alloc work */

1092

/* ok, we're done with i_size and alloc work */

1071

iocb->ki_pos = saved_pos;

1093

iocb->ki_pos = saved_pos;

1072

ocfs2_meta_unlock(inode, meta_level);

1094

ocfs2_meta_unlock(inode, meta_level);

1073

meta_level = -1;

1095

meta_level = -1;

1074

1096

1075

/* communicate with ocfs2_dio_end_io */

1097

/* communicate with ocfs2_dio_end_io */

1076

ocfs2_iocb_set_rw_locked(iocb);

1098

ocfs2_iocb_set_rw_locked(iocb);

1077

1099

1078

ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);

1100

ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);

1079

1101

1080

/* buffered aio wouldn't have proper lock coverage today */

1102

/* buffered aio wouldn't have proper lock coverage today */

1081

BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));

1103

BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));

1082

1104

1083

/*

1105

/*

1084

* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io

1106

* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io

1085

* function pointer which is called when o_direct io completes so that

1107

* function pointer which is called when o_direct io completes so that

1086

* it can unlock our rw lock. (it's the clustered equivalent of

1108

* it can unlock our rw lock. (it's the clustered equivalent of

1087

* i_alloc_sem; protects truncate from racing with pending ios).

1109

* i_alloc_sem; protects truncate from racing with pending ios).

1088

* Unfortunately there are error cases which call end_io and others

1110

* Unfortunately there are error cases which call end_io and others

1089

* that don't. so we don't have to unlock the rw_lock if either an

1111

* that don't. so we don't have to unlock the rw_lock if either an

1090

* async dio is going to do it in the future or an end_io after an

1112

* async dio is going to do it in the future or an end_io after an

1091

* error has already done it.

1113

* error has already done it.

1092

*/

1114

*/

1093

if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {

1115

if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {

1094

rw_level = -1;

1116

rw_level = -1;

1095

have_alloc_sem = 0;

1117

have_alloc_sem = 0;

1096

}

1118

}

1097

1119

1098

out:

1120

out:

1099

if (meta_level != -1)

1121

if (meta_level != -1)

1100

ocfs2_meta_unlock(inode, meta_level);

1122

ocfs2_meta_unlock(inode, meta_level);

1101

if (have_alloc_sem)

1123

if (have_alloc_sem)

1102

up_read(&inode->i_alloc_sem);

1124

up_read(&inode->i_alloc_sem);

1103

if (rw_level != -1)

1125

if (rw_level != -1)

1104

ocfs2_rw_unlock(inode, rw_level);

1126

ocfs2_rw_unlock(inode, rw_level);

1105

mutex_unlock(&inode->i_mutex);

1127

mutex_unlock(&inode->i_mutex);

1106

1128

1107

mlog_exit(ret);

1129

mlog_exit(ret);

1108

return ret;

1130

return ret;

1109

}

1131

}

1110

1132

1111

static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,

1133

static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,

1112

char __user *buf,

1134

char __user *buf,

1113

size_t count,

1135

size_t count,

1114

loff_t pos)

1136

loff_t pos)

1115

{

1137

{

1116

int ret = 0, rw_level = -1, have_alloc_sem = 0;

1138

int ret = 0, rw_level = -1, have_alloc_sem = 0;

1117

struct file *filp = iocb->ki_filp;

1139

struct file *filp = iocb->ki_filp;

1118

struct inode *inode = filp->f_dentry->d_inode;

1140

struct inode *inode = filp->f_dentry->d_inode;

1119

1141

1120

mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,

1142

mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,

1121

(unsigned int)count,

1143

(unsigned int)count,

1122

filp->f_dentry->d_name.len,

1144

filp->f_dentry->d_name.len,

1123

filp->f_dentry->d_name.name);

1145

filp->f_dentry->d_name.name);

1124

1146

1125

if (!inode) {

1147

if (!inode) {

1126

ret = -EINVAL;

1148

ret = -EINVAL;

1127

mlog_errno(ret);

1149

mlog_errno(ret);

1128

goto bail;

1150

goto bail;

1129

}

1151

}

1130

1152

1131

/*

1153

/*

1132

* buffered reads protect themselves in ->readpage(). O_DIRECT reads

1154

* buffered reads protect themselves in ->readpage(). O_DIRECT reads

1133

* need locks to protect pending reads from racing with truncate.

1155

* need locks to protect pending reads from racing with truncate.

1134

*/

1156

*/

1135

if (filp->f_flags & O_DIRECT) {

1157

if (filp->f_flags & O_DIRECT) {

1136

down_read(&inode->i_alloc_sem);

1158

down_read(&inode->i_alloc_sem);

1137

have_alloc_sem = 1;

1159

have_alloc_sem = 1;

1138

1160

1139

ret = ocfs2_rw_lock(inode, 0);

1161

ret = ocfs2_rw_lock(inode, 0);

1140

if (ret < 0) {

1162

if (ret < 0) {

1141

mlog_errno(ret);

1163

mlog_errno(ret);

1142

goto bail;

1164

goto bail;

1143

}

1165

}

1144

rw_level = 0;

1166

rw_level = 0;

1145

/* communicate with ocfs2_dio_end_io */

1167

/* communicate with ocfs2_dio_end_io */

1146

ocfs2_iocb_set_rw_locked(iocb);

1168

ocfs2_iocb_set_rw_locked(iocb);

1147

}

1169

}

1148

1170

1149

ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos);

1171

ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos);

1150

if (ret == -EINVAL)

1172

if (ret == -EINVAL)

1151

mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");

1173

mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");

1152

1174

1153

/* buffered aio wouldn't have proper lock coverage today */

1175

/* buffered aio wouldn't have proper lock coverage today */

1154

BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));

1176

BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));

1155

1177

1156

/* see ocfs2_file_aio_write */

1178

/* see ocfs2_file_aio_write */

1157

if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {

1179

if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {

1158

rw_level = -1;

1180

rw_level = -1;

1159

have_alloc_sem = 0;

1181

have_alloc_sem = 0;

1160

}

1182

}

1161

1183

1162

bail:

1184

bail:

1163

if (have_alloc_sem)

1185

if (have_alloc_sem)

1164

up_read(&inode->i_alloc_sem);

1186

up_read(&inode->i_alloc_sem);

1165

if (rw_level != -1)

1187

if (rw_level != -1)

1166

ocfs2_rw_unlock(inode, rw_level);

1188

ocfs2_rw_unlock(inode, rw_level);

1167

mlog_exit(ret);

1189

mlog_exit(ret);

1168

1190

1169

return ret;

1191

return ret;

1170

}

1192

}

1171

1193

1172

struct inode_operations ocfs2_file_iops = {

1194

struct inode_operations ocfs2_file_iops = {

1173

.setattr = ocfs2_setattr,

1195

.setattr = ocfs2_setattr,

1174

.getattr = ocfs2_getattr,

1196

.getattr = ocfs2_getattr,

1175

};

1197

};

1176

1198

1177

struct inode_operations ocfs2_special_file_iops = {

1199

struct inode_operations ocfs2_special_file_iops = {

1178

.setattr = ocfs2_setattr,

1200

.setattr = ocfs2_setattr,

1179

.getattr = ocfs2_getattr,

1201

.getattr = ocfs2_getattr,

1180

};

1202

};

1181

1203

1182

const struct file_operations ocfs2_fops = {

1204

const struct file_operations ocfs2_fops = {

1183

.read = do_sync_read,

1205

.read = do_sync_read,

1184

.write = do_sync_write,

1206

.write = do_sync_write,

1185

.sendfile = generic_file_sendfile,

1207

.sendfile = generic_file_sendfile,

1186

.mmap = ocfs2_mmap,

1208

.mmap = ocfs2_mmap,

1187

.fsync = ocfs2_sync_file,

1209

.fsync = ocfs2_sync_file,

1188

.release = ocfs2_file_release,

1210

.release = ocfs2_file_release,

GITLAB

ocfs2: take data locks around extend

 /* -*- mode: c; c-basic-offset: 8; -*-
  * vim: noexpandtab sw=8 ts=8 sts=0:
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
  * License as published by the Free Software Foundation; either
  * version 2 of the License, or (at your option) any later version.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
  *
  * You should have received a copy of the GNU General Public
  * License along with this program; if not, write to the
  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  * Boston, MA 021110-1307, USA.
  */
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <asm/byteorder.h>
 #define MLOG_MASK_PREFIX ML_FILE_IO
 #include <cluster/masklog.h>
 #include "ocfs2.h"
 #include "alloc.h"
 #include "aops.h"
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "file.h"
 #include "inode.h"
 #include "journal.h"
 #include "super.h"
 #include "symlink.h"
 #include "buffer_head_io.h"
 /*
  * get_block callback for symlink inodes whose target lives in allocated
  * blocks (fast symlinks trip the BUG_ON below).
  *
  * Reads and validates the inode's dinode block, checks that iblock lies
  * inside the allocated clusters, and maps bh_result to the physical block
  * derived from the first extent record.  For a new inode whose result
  * buffer is not uptodate, the data is first copied over from the buffer
  * cache.
  *
  * @inode:     symlink inode being mapped
  * @iblock:    logical block within the symlink
  * @bh_result: buffer head to map (and possibly fill)
  * @create:    only logged here
  *
  * Returns 0 on success and -EIO on every failure path; the status of
  * ocfs2_read_block() is logged but not propagated.
  */
 static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result, int create)
 {
 	int err = -EIO;
 	int status;
 	struct ocfs2_dinode *fe = NULL;
 	struct buffer_head *bh = NULL;
 	struct buffer_head *buffer_cache_bh = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	void *kaddr;
 	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
 		   (unsigned long long)iblock, bh_result, create);
 	BUG_ON(ocfs2_inode_is_fast_symlink(inode));
 	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
 		mlog(ML_ERROR, "block offset > PATH_MAX: %llu\n",
 		     (unsigned long long)iblock);
 		goto bail;
 	}
 	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
 				  OCFS2_I(inode)->ip_blkno,
 				  &bh, OCFS2_BH_CACHED, inode);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 	fe = (struct ocfs2_dinode *) bh->b_data;
 	if (!OCFS2_IS_VALID_DINODE(fe)) {
 		mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
 		     (unsigned long long)fe->i_blkno, 7, fe->i_signature);
 		goto bail;
 	}
 	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
 						    le32_to_cpu(fe->i_clusters))) {
 		mlog(ML_ERROR, "block offset is outside the allocated size: "
 		     "%llu\n", (unsigned long long)iblock);
 		goto bail;
 	}
 	/* We don't use the page cache to create symlink data, so if
 	 * need be, copy it over from the buffer cache. */
 	if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
 		u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
 			    iblock;
 		buffer_cache_bh = sb_getblk(osb->sb, blkno);
 		if (!buffer_cache_bh) {
 			mlog(ML_ERROR, "couldn't getblock for symlink!\n");
 			goto bail;
 		}
 		/* we haven't locked out transactions, so a commit
 		 * could've happened. Since we've got a reference on
 		 * the bh, even if it commits while we're doing the
 		 * copy, the data is still good. */
 		if (buffer_jbd(buffer_cache_bh)
 		    && ocfs2_inode_is_new(inode)) {
 			kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
 			if (!kaddr) {
 				mlog(ML_ERROR, "couldn't kmap!\n");
 				/* drop the sb_getblk() reference before
 				 * bailing, bail only releases bh */
 				brelse(buffer_cache_bh);
 				goto bail;
 			}
 			memcpy(kaddr + (bh_result->b_size * iblock),
 			       buffer_cache_bh->b_data,
 			       bh_result->b_size);
 			kunmap_atomic(kaddr, KM_USER0);
 			set_buffer_uptodate(bh_result);
 		}
 		brelse(buffer_cache_bh);
 	}
 	map_bh(bh_result, inode->i_sb,
 	       le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);
 	err = 0;
 bail:
 	if (bh)
 		brelse(bh);
 	mlog_exit(err);
 	return err;
 }
 /*
  * get_block callback used by the buffered read/write paths in this file.
  *
  * Symlink inodes are handed to ocfs2_symlink_get_block().  For everything
  * else the logical block is checked against the in-memory cluster count
  * (under ip_lock), translated through the extent map, and mapped into
  * bh_result.  When @create is set and iblock is at or beyond the block
  * count implied by i_size, the buffer is additionally flagged new.
  *
  * @inode:     inode being mapped
  * @iblock:    logical block to map
  * @bh_result: buffer head receiving the mapping
  * @create:    non-zero when called on behalf of a write
  *
  * Returns 0 on success; any negative error is collapsed to -EIO.
  */
 static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 			   struct buffer_head *bh_result, int create)
 {
 	int err = 0;
 	/* initialised so the error log below never prints an
 	 * indeterminate value when the extent map lookup fails */
 	u64 p_blkno = 0;
 	u64 past_eof;
 	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
 		   (unsigned long long)iblock, bh_result, create);
 	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
 		mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
 		     inode, inode->i_ino);
 	if (S_ISLNK(inode->i_mode)) {
 		/* this always does I/O for some reason. */
 		err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
 		goto bail;
 	}
 	/* this can happen if another node truncs after our extend! */
 	spin_lock(&OCFS2_I(inode)->ip_lock);
 	if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
 					       OCFS2_I(inode)->ip_clusters))
 		err = -EIO;
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
 	if (err)
 		goto bail;
 	err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
 					  NULL);
 	if (err) {
 		mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
 		     "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
 		     (unsigned long long)p_blkno);
 		goto bail;
 	}
 	map_bh(bh_result, inode->i_sb, p_blkno);
 	/* a mapping to physical block 0 is reported as an error, but the
 	 * new-buffer bookkeeping below still runs */
 	if (bh_result->b_blocknr == 0) {
 		err = -EIO;
 		mlog(ML_ERROR, "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
 		     (unsigned long long)iblock,
 		     (unsigned long long)p_blkno,
 		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
 	}
 	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
 	mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
 	     (unsigned long long)past_eof);
 	if (create && (iblock >= past_eof))
 		set_buffer_new(bh_result);
 bail:
 	if (err < 0)
 		err = -EIO;
 	mlog_exit(err);
 	return err;
 }
 /*
  * ->readpage: fill one page cache page from disk.
  *
  * Takes the meta lock, then ip_alloc_sem for reading, then the data lock,
  * and reads the page through block_read_full_page() with ocfs2_get_block.
  * A page that starts at or beyond i_size is zero-filled and marked
  * uptodate without touching the data lock.
  *
  * "unlock" tracks whether this function still has to unlock the page on
  * the way out; it is cleared when a lock helper returns
  * AOP_TRUNCATED_PAGE and once block_read_full_page() has been called.
  * NOTE(review): presumably the page is already unlocked (or will be
  * unlocked by I/O completion) in those cases - confirm against the VFS.
  *
  * Returns 0 on success or the error/AOP code from the failing step.
  */
 static int ocfs2_readpage(struct file *file, struct page *page)
 {
 	struct inode *inode = page->mapping->host;
 	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
 	int ret, unlock = 1;
 	mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
 	ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
 	if (ret != 0) {
 		if (ret == AOP_TRUNCATED_PAGE)
 			unlock = 0;
 		mlog_errno(ret);
 		goto out;
 	}
 	down_read(&OCFS2_I(inode)->ip_alloc_sem);
 	/*
 	 * i_size might have just been updated as we grabbed the meta lock.  We
 	 * might now be discovering a truncate that hit on another node.
 	 * block_read_full_page->get_block freaks out if it is asked to read
 	 * beyond the end of a file, so we check here.  Callers
 	 * (generic_file_read, fault->nopage) are clever enough to check i_size
 	 * and notice that the page they just read isn't needed.
 	 *
 	 * XXX sys_readahead() seems to get that wrong?
 	 */
 	if (start >= i_size_read(inode)) {
 		char *addr = kmap(page);
 		memset(addr, 0, PAGE_SIZE);
 		flush_dcache_page(page);
 		kunmap(page);
 		SetPageUptodate(page);
 		ret = 0;
 		goto out_alloc;
 	}
 	ret = ocfs2_data_lock_with_page(inode, 0, page);
 	if (ret != 0) {
 		if (ret == AOP_TRUNCATED_PAGE)
 			unlock = 0;
 		mlog_errno(ret);
 		goto out_alloc;
 	}
 	ret = block_read_full_page(page, ocfs2_get_block);
 	/* from here on the page is no longer ours to unlock */
 	unlock = 0;
 	ocfs2_data_unlock(inode, 0);
 out_alloc:
 	/* release in reverse order of acquisition */
 	up_read(&OCFS2_I(inode)->ip_alloc_sem);
 	ocfs2_meta_unlock(inode, 0);
 out:
 	if (unlock)
 		unlock_page(page);
 	mlog_exit(ret);
 	return ret;
 }
 /* Note: Because we don't support holes, our allocation has
  * already happened (allocation writes zeros to the file data)
  * so we don't have to worry about ordered writes in
  * ocfs2_writepage.
  *
  * ->writepage is called during the process of invalidating the page cache
  * during blocked lock processing.  It can't block on any cluster locks
  * during block mapping.  It's relying on the fact that the block
  * mapping can't have disappeared under the dirty pages that it is
  * being asked to write back.
  */
 /* ->writepage: write the page back via block_write_full_page(), using
  * ocfs2_get_block for the block mapping; no cluster locks are taken. */
 static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
 {
 	int status;

 	mlog_entry("(0x%p)\n", page);

 	status = block_write_full_page(page, ocfs2_get_block, wbc);

 	mlog_exit(status);
 	return status;
 }
+/* This can also be called from ocfs2_write_zero_page() which has done
+ * its own cluster locking. */
+int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
+			       unsigned from, unsigned to)
+{
+	int ret;
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+	ret = block_prepare_write(page, from, to, ocfs2_get_block);
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+	return ret;
+}
 /*
  * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
  * from loopback.  It must be able to perform its own locking around
  * ocfs2_get_block().
  */
-int ocfs2_prepare_write(struct file *file, struct page *page,
+static int ocfs2_prepare_write(struct file *file, struct page *page,
-			unsigned from, unsigned to)
+			       unsigned from, unsigned to)
 {
 	struct inode *inode = page->mapping->host;
 	int ret;
 	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
 	ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
 	if (ret != 0) {
 		mlog_errno(ret);
 		goto out;
 	}
-	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+	ret = ocfs2_prepare_write_nolock(inode, page, from, to);
-	ret = block_prepare_write(page, from, to, ocfs2_get_block);
-	up_read(&OCFS2_I(inode)->ip_alloc_sem);
 	ocfs2_meta_unlock(inode, 0);
 out:
 	mlog_exit(ret);
 	return ret;
 }
 /* Taken from ext3. We don't necessarily need the full blown
  * functionality yet, but IMHO it's better to cut and paste the whole
  * thing so we can avoid introducing our own bugs (and easily pick up
  * their fixes when they happen) --Mark */
 /*
  * Walk the circular list of buffers attached to a page, starting at @head,
  * and call @fn on every buffer that overlaps the byte range [from, to).
  * Buffers outside the range are skipped; if @partial is non-NULL it is set
  * to 1 when such a skipped buffer is not uptodate.
  *
  * The walk stops after the first non-zero return from @fn, and that value
  * is returned; otherwise 0.
  */
 static int walk_page_buffers(handle_t *handle,
 			     struct buffer_head *head,
 			     unsigned from,
 			     unsigned to,
 			     int *partial,
 			     int (*fn)(handle_t *handle,
 				       struct buffer_head *bh))
 {
 	unsigned blocksize = head->b_size;
 	unsigned block_start = 0;
 	unsigned block_end;
 	struct buffer_head *bh = head;
 	struct buffer_head *next;
 	int err, ret = 0;

 	/* !block_start lets the first iteration through even though
 	 * bh == head; afterwards bh == head means we wrapped around */
 	while (ret == 0 && (bh != head || !block_start)) {
 		next = bh->b_this_page;
 		block_end = block_start + blocksize;

 		if (block_end > from && block_start < to) {
 			err = (*fn)(handle, bh);
 			if (!ret)
 				ret = err;
 		} else if (partial && !buffer_uptodate(bh)) {
 			*partial = 1;
 		}

 		block_start = block_end;
 		bh = next;
 	}
 	return ret;
 }
 /*
  * Start a transaction with OCFS2_INODE_UPDATE_CREDITS and, when the inode
  * uses ordered data, run ocfs2_journal_dirty_data over the page buffers
  * covering [from, to).
  *
  * Returns the started handle on success.  On failure the handle (if one
  * was started) is committed and an ERR_PTR() is returned instead.
  */
 struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
 							 struct page *page,
 							 unsigned from,
 							 unsigned to)
 {
 	struct ocfs2_journal_handle *handle;
 	int ret;

 	handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), NULL,
 				   OCFS2_INODE_UPDATE_CREDITS);
 	if (!handle) {
 		/* NOTE(review): only NULL is treated as failure here -
 		 * confirm ocfs2_start_trans() never returns ERR_PTR() */
 		ret = -ENOMEM;
 		mlog_errno(ret);
 		return ERR_PTR(ret);
 	}

 	if (!ocfs2_should_order_data(inode))
 		return handle;

 	ret = walk_page_buffers(handle->k_handle, page_buffers(page),
 				from, to, NULL, ocfs2_journal_dirty_data);
 	if (ret < 0)
 		mlog_errno(ret);

 	if (ret) {
 		ocfs2_commit_trans(handle);
 		handle = ERR_PTR(ret);
 	}
 	return handle;
 }
 /*
  * ->commit_write: finish a buffered write to bytes [from, to) of @page.
  *
  * The write is "extending" when the end of the written range lies past
  * the current i_size; in that case the meta lock is taken at level 1
  * instead of 0, a transaction is started, and the on-disk inode (size
  * and c/mtime) is updated and journalled after generic_commit_write().
  * The data lock is always taken at level 1.
  *
  * Cleanup unwinds in reverse order of acquisition: transaction, data
  * lock, meta lock, then the dinode buffer reference.
  *
  * Returns 0 on success or a negative error from the failing step.
  */
 static int ocfs2_commit_write(struct file *file, struct page *page,
 			      unsigned from, unsigned to)
 {
 	int ret, extending = 0, locklevel = 0;
 	loff_t new_i_size;
 	struct buffer_head *di_bh = NULL;
 	struct inode *inode = page->mapping->host;
 	struct ocfs2_journal_handle *handle = NULL;
 	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
 	/* NOTE: ocfs2_file_aio_write has ensured that it's safe for
 	 * us to sample inode->i_size here without the metadata lock:
 	 *
 	 * 1) We're currently holding the inode alloc lock, so no
 	 *    nodes can change it underneath us.
 	 *
 	 * 2) We've had to take the metadata lock at least once
 	 *    already to check for extending writes, hence insuring
 	 *    that our current copy is also up to date.
 	 */
 	/* file offset one past the last byte written into this page */
 	new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
 	if (new_i_size > i_size_read(inode)) {
 		extending = 1;
 		locklevel = 1;
 	}
 	ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page);
 	if (ret != 0) {
 		mlog_errno(ret);
 		goto out;
 	}
 	ret = ocfs2_data_lock_with_page(inode, 1, page);
 	if (ret != 0) {
 		mlog_errno(ret);
 		goto out_unlock_meta;
 	}
 	if (extending) {
 		handle = ocfs2_start_walk_page_trans(inode, page, from, to);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
 			/* cleared so out_commit never sees an ERR_PTR */
 			handle = NULL;
 			goto out_unlock_data;
 		}
 		/* Mark our buffer early. We'd rather catch this error up here
 		 * as opposed to after a successful commit_write which would
 		 * require us to set back inode->i_size. */
 		ret = ocfs2_journal_access(handle, inode, di_bh,
 					   OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out_commit;
 		}
 	}
 	/* might update i_size */
 	ret = generic_commit_write(file, page, from, to);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_commit;
 	}
 	if (extending) {
 		/* mirror the new in-memory size and timestamps into the
 		 * on-disk inode and journal that buffer */
 		loff_t size = (u64) i_size_read(inode);
 		struct ocfs2_dinode *di =
 			(struct ocfs2_dinode *)di_bh->b_data;
 		/* ocfs2_mark_inode_dirty is too heavy to use here. */
 		inode->i_blocks = ocfs2_align_bytes_to_sectors(size);
 		inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 		di->i_size = cpu_to_le64(size);
 		di->i_ctime = di->i_mtime =
 				cpu_to_le64(inode->i_mtime.tv_sec);
 		di->i_ctime_nsec = di->i_mtime_nsec =
 				cpu_to_le32(inode->i_mtime.tv_nsec);
 		ret = ocfs2_journal_dirty(handle, di_bh);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out_commit;
 		}
 	}
 	/* an extending write must have left i_size exactly at the end of
 	 * the range written */
 	BUG_ON(extending && (i_size_read(inode) != new_i_size));
 out_commit:
 	if (handle)
 		ocfs2_commit_trans(handle);
 out_unlock_data:
 	ocfs2_data_unlock(inode, 1);
 out_unlock_meta:
 	ocfs2_meta_unlock(inode, locklevel);
 out:
 	if (di_bh)
 		brelse(di_bh);
 	mlog_exit(ret);
 	return ret;
 }
 /*
  * ->bmap: translate logical block @block of the mapping's inode into a
  * physical block number via the extent map.
  *
  * Unless the inode is a journal system file, the lookup runs under the
  * meta lock and ip_alloc_sem (read).  Returns the physical block, or 0 if
  * locking or the lookup failed.
  */
 static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
 {
 	struct inode *inode = mapping->host;
 	u64 phys = 0;
 	sector_t result;
 	int ret = 0;

 	mlog_entry("(block = %llu)\n", (unsigned long long)block);

 	/* We don't need to lock journal system files, since they aren't
 	 * accessed concurrently from multiple nodes.
 	 */
 	if (!INODE_JOURNAL(inode)) {
 		ret = ocfs2_meta_lock(inode, NULL, NULL, 0);
 		if (ret) {
 			if (ret != -ENOENT)
 				mlog_errno(ret);
 			goto bail;
 		}
 		down_read(&OCFS2_I(inode)->ip_alloc_sem);
 	}

 	ret = ocfs2_extent_map_get_blocks(inode, block, 1, &phys, NULL);

 	if (!INODE_JOURNAL(inode)) {
 		up_read(&OCFS2_I(inode)->ip_alloc_sem);
 		ocfs2_meta_unlock(inode, 0);
 	}

 	if (ret) {
 		mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
 		     (unsigned long long)block);
 		mlog_errno(ret);
 	}

 bail:
 	if (ret)
 		result = 0;
 	else
 		result = phys;

 	mlog_exit((int)result);
 	return result;
 }
 /*
  * TODO: Make this into a generic get_blocks function.
  *
  * From do_direct_io in direct-io.c:
  *  "So what we do is to permit the ->get_blocks function to populate
  *   bh.b_size with the size of IO which is permitted at this offset and
  *   this i_blkbits."
  *
  * This function is called directly from get_more_blocks in direct-io.c.
  *
  * called like this: dio->get_blocks(dio->inode, fs_startblk,
  * 					fs_count, map_bh, dio->rw == WRITE);
  */
 /*
  * get_blocks callback for direct I/O.
  *
  * On entry bh_result->b_size holds the size of the request; it is
  * converted to a block count using inode->i_blkbits.  The request must
  * lie entirely inside the inode's allocated clusters.  On success
  * bh_result is mapped to the physical block for @iblock and b_size is
  * rewritten to the length of the contiguous run found there, capped at
  * the requested block count.
  *
  * @inode:     inode under I/O
  * @iblock:    first logical block of the request
  * @bh_result: in: requested size in b_size; out: mapping and mapped size
  * @create:    unused here
  *
  * Returns 0 on success and -EIO on any failure.
  */
 static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 				     struct buffer_head *bh_result, int create)
 {
 	int ret;
 	u64 p_blkno;
 	int contig_blocks;
 	unsigned char blocksize_bits;
 	unsigned long max_blocks;
 	if (!inode || !bh_result) {
 		mlog(ML_ERROR, "inode or bh_result is null\n");
 		return -EIO;
 	}
 	/* only dereference the arguments once they are known non-NULL */
 	max_blocks = bh_result->b_size >> inode->i_blkbits;
 	blocksize_bits = inode->i_sb->s_blocksize_bits;
 	/* This function won't even be called if the request isn't all
 	 * nicely aligned and of the right size, so there's no need
 	 * for us to check any of that. */
 	spin_lock(&OCFS2_I(inode)->ip_lock);
 	if ((iblock + max_blocks) >
 	    ocfs2_clusters_to_blocks(inode->i_sb,
 				     OCFS2_I(inode)->ip_clusters)) {
 		spin_unlock(&OCFS2_I(inode)->ip_lock);
 		ret = -EIO;
 		goto bail;
 	}
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
 	/* This figures out the size of the next contiguous block, and
 	 * our logical offset */
 	ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
 					  &contig_blocks);
 	if (ret) {
 		mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
 		     (unsigned long long)iblock);
 		ret = -EIO;
 		goto bail;
 	}
 	map_bh(bh_result, inode->i_sb, p_blkno);
 	/* make sure we don't map more than max_blocks blocks here as
 	   that's all the kernel will handle at this point. */
 	if (max_blocks < contig_blocks)
 		contig_blocks = max_blocks;
 	bh_result->b_size = contig_blocks << blocksize_bits;
 bail:
 	return ret;
 }
 /*
  * ocfs2_dio_end_io is called by the dio core when a dio is finished.  We're
  * particularly interested in the aio/dio case.  Like the core uses
  * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
  * truncation on another.
  */
 /* Completion callback passed to blockdev_direct_IO_no_locking() by
  * ocfs2_direct_IO().  @offset, @bytes and @private are not used. */
 static void ocfs2_dio_end_io(struct kiocb *iocb,
 			     loff_t offset,
 			     ssize_t bytes,
 			     void *private)
 {
 	struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
 	/* this io's submitter should not have unlocked this before we could */
 	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
 	/* clear the marker first so the iocb no longer reads as rw-locked,
 	 * then drop i_alloc_sem and the level-0 rw lock */
 	ocfs2_iocb_clear_rw_locked(iocb);
 	up_read(&inode->i_alloc_sem);
 	ocfs2_rw_unlock(inode, 0);
 }
 static ssize_t ocfs2_direct_IO(int rw,
 			       struct kiocb *iocb,
 			       const struct iovec *iov,
 			       loff_t offset,
 			       unsigned long nr_segs)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
 	int ret;
 	mlog_entry_void();
+	/*
+	 * We get PR data locks even for O_DIRECT.  This allows
+	 * concurrent O_DIRECT I/O but doesn't let O_DIRECT with
+	 * extending and buffered zeroing writes race.  If they did
+	 * race then the buffered zeroing could be written back after
+	 * the O_DIRECT I/O.  It's one thing to tell people not to mix
+	 * buffered and O_DIRECT writes, but expecting them to
+	 * understand that file extension is also an implicit buffered
+	 * write is too much.  By getting the PR we force writeback of
+	 * the buffered zeroing before proceeding.
+	 */
+	ret = ocfs2_data_lock(inode, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+	ocfs2_data_unlock(inode, 0);
 	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
 					    inode->i_sb->s_bdev, iov, offset,
 					    nr_segs,
 					    ocfs2_direct_IO_get_blocks,
 					    ocfs2_dio_end_io);
+out:
 	mlog_exit(ret);
 	return ret;
 }
 struct address_space_operations ocfs2_aops = {
 	.readpage	= ocfs2_readpage,
 	.writepage	= ocfs2_writepage,
 	.prepare_write	= ocfs2_prepare_write,
 	.commit_write	= ocfs2_commit_write,
 	.bmap		= ocfs2_bmap,