Doug / smarc-fsl-linux-kernel

1

#include <linux/ceph/ceph_debug.h>

1

#include <linux/ceph/ceph_debug.h>

2

3

#include <linux/backing-dev.h>

3

#include <linux/backing-dev.h>

4

#include <linux/fs.h>

4

#include <linux/fs.h>

5

#include <linux/mm.h>

5

#include <linux/mm.h>

6

#include <linux/pagemap.h>

6

#include <linux/pagemap.h>

7

#include <linux/writeback.h> /* generic_writepages */

7

#include <linux/writeback.h> /* generic_writepages */

8

#include <linux/slab.h>

8

#include <linux/slab.h>

9

#include <linux/pagevec.h>

9

#include <linux/pagevec.h>

10

#include <linux/task_io_accounting_ops.h>

10

#include <linux/task_io_accounting_ops.h>

11

12

#include "super.h"

12

#include "super.h"

13

#include "mds_client.h"

13

#include "mds_client.h"

14

#include "cache.h"

14

#include "cache.h"

15

#include <linux/ceph/osd_client.h>

15

#include <linux/ceph/osd_client.h>

16

17

/*
 * Ceph address space ops.
 *
 * There are a few funny things going on here.
 *
 * The page->private field is used to reference a struct
 * ceph_snap_context for _every_ dirty page.  This indicates which
 * snapshot the page was logically dirtied in, and thus which snap
 * context needs to be associated with the osd write during writeback.
 *
 * Similarly, struct ceph_inode_info maintains a set of counters to
 * count dirty pages on the inode.  In the absence of snapshots,
 * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
 *
 * When a snapshot is taken (that is, when the client receives
 * notification that a snapshot was taken), each inode with caps and
 * with dirty pages (dirty pages implies there is a cap) gets a new
 * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
 * order, new snaps go to the tail).  The i_wrbuffer_ref_head count is
 * moved to capsnap->dirty.  (Unless a sync write is currently in
 * progress.  In that case, the capsnap is said to be "pending", new
 * writes cannot start, and the capsnap isn't "finalized" until the
 * write completes (or fails) and a final size/mtime for the inode for
 * that snap can be settled upon.)  i_wrbuffer_ref_head is reset to 0.
 *
 * On writeback, we must submit writes to the osd IN SNAP ORDER.  So,
 * we look for the first capsnap in i_cap_snaps and write out pages in
 * that snap context _only_.  Then we move on to the next capsnap,
 * eventually reaching the "live" or "head" context (i.e., pages that
 * are not yet snapped), at which point we are writing the most
 * recently dirtied pages.
 *
 * Invalidate and so forth must take care to ensure the dirty page
 * accounting is preserved.
 */

52

53

/*
 * Thresholds derived from a limit expressed in kilobytes.
 *
 * CONGESTION_ON_THRESH converts the KB value to a page count by shifting
 * right by (PAGE_SHIFT - 10).  CONGESTION_OFF_THRESH is the "on" value
 * minus one quarter of it.
 *
 * The argument is parenthesized so that expressions built from operators
 * binding more loosely than '>>' (e.g. "a | b", "c ? x : y") expand
 * correctly.
 */
#define CONGESTION_ON_THRESH(congestion_kb) \
	((congestion_kb) >> (PAGE_SHIFT - 10))
#define CONGESTION_OFF_THRESH(congestion_kb)			\
	(CONGESTION_ON_THRESH(congestion_kb) -			\
	 (CONGESTION_ON_THRESH(congestion_kb) >> 2))

57

58

static inline struct ceph_snap_context *page_snap_context(struct page *page)

58

static inline struct ceph_snap_context *page_snap_context(struct page *page)

59

{

59

{

60

if (PagePrivate(page))

60

if (PagePrivate(page))

61

return (void *)page->private;

61

return (void *)page->private;

62

return NULL;

62

return NULL;

63

}

63

}

64

65

/*

65

/*

66

* Dirty a page. Optimistically adjust accounting, on the assumption

66

* Dirty a page. Optimistically adjust accounting, on the assumption

67

* that we won't race with invalidate. If we do, readjust.

67

* that we won't race with invalidate. If we do, readjust.

68

*/

68

*/

69

static int ceph_set_page_dirty(struct page *page)

69

static int ceph_set_page_dirty(struct page *page)

70

{

70

{

71

struct address_space *mapping = page->mapping;

71

struct address_space *mapping = page->mapping;

72

struct inode *inode;

72

struct inode *inode;

73

struct ceph_inode_info *ci;

73

struct ceph_inode_info *ci;

74

struct ceph_snap_context *snapc;

74

struct ceph_snap_context *snapc;

75

int ret;

75

int ret;

76

77

if (unlikely(!mapping))

77

if (unlikely(!mapping))

78

return !TestSetPageDirty(page);

78

return !TestSetPageDirty(page);

79

80

if (PageDirty(page)) {

80

if (PageDirty(page)) {

81

dout("%p set_page_dirty %p idx %lu -- already dirty\n",

81

dout("%p set_page_dirty %p idx %lu -- already dirty\n",

82

mapping->host, page, page->index);

82

mapping->host, page, page->index);

83

BUG_ON(!PagePrivate(page));

83

BUG_ON(!PagePrivate(page));

84

return 0;

84

return 0;

85

}

85

}

86

87

inode = mapping->host;

87

inode = mapping->host;

88

ci = ceph_inode(inode);

88

ci = ceph_inode(inode);

89

90

/*

90

/*

91

* Note that we're grabbing a snapc ref here without holding

91

* Note that we're grabbing a snapc ref here without holding

92

* any locks!

92

* any locks!

93

*/

93

*/

94

snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);

94

snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);

95

96

/* dirty the head */

96

/* dirty the head */

97

spin_lock(&ci->i_ceph_lock);

97

spin_lock(&ci->i_ceph_lock);

98

if (ci->i_head_snapc == NULL)

98

if (ci->i_head_snapc == NULL)

99

ci->i_head_snapc = ceph_get_snap_context(snapc);

99

ci->i_head_snapc = ceph_get_snap_context(snapc);

100

++ci->i_wrbuffer_ref_head;

100

++ci->i_wrbuffer_ref_head;

101

if (ci->i_wrbuffer_ref == 0)

101

if (ci->i_wrbuffer_ref == 0)

102

ihold(inode);

102

ihold(inode);

103

++ci->i_wrbuffer_ref;

103

++ci->i_wrbuffer_ref;

104

dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "

104

dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "

105

"snapc %p seq %lld (%d snaps)\n",

105

"snapc %p seq %lld (%d snaps)\n",

106

mapping->host, page, page->index,

106

mapping->host, page, page->index,

107

ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,

107

ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,

108

ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,

108

ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,

109

snapc, snapc->seq, snapc->num_snaps);

109

snapc, snapc->seq, snapc->num_snaps);

110

spin_unlock(&ci->i_ceph_lock);

110

spin_unlock(&ci->i_ceph_lock);

111

112

/*

112

/*

113

* Reference snap context in page->private. Also set

113

* Reference snap context in page->private. Also set

114

* PagePrivate so that we get invalidatepage callback.

114

* PagePrivate so that we get invalidatepage callback.

115

*/

115

*/

116

BUG_ON(PagePrivate(page));

116

BUG_ON(PagePrivate(page));

117

page->private = (unsigned long)snapc;

117

page->private = (unsigned long)snapc;

118

SetPagePrivate(page);

118

SetPagePrivate(page);

119

120

ret = __set_page_dirty_nobuffers(page);

120

ret = __set_page_dirty_nobuffers(page);

121

WARN_ON(!PageLocked(page));

121

WARN_ON(!PageLocked(page));

122

WARN_ON(!page->mapping);

122

WARN_ON(!page->mapping);

123

124

return ret;

124

return ret;

125

}

125

}

126

127

/*

127

/*

128

* If we are truncating the full page (i.e. offset == 0), adjust the

128

* If we are truncating the full page (i.e. offset == 0), adjust the

129

* dirty page counters appropriately. Only called if there is private

129

* dirty page counters appropriately. Only called if there is private

130

* data on the page.

130

* data on the page.

131

*/

131

*/

132

static void ceph_invalidatepage(struct page *page, unsigned int offset,

132

static void ceph_invalidatepage(struct page *page, unsigned int offset,

133

unsigned int length)

133

unsigned int length)

134

{

134

{

135

struct inode *inode;

135

struct inode *inode;

136

struct ceph_inode_info *ci;

136

struct ceph_inode_info *ci;

137

struct ceph_snap_context *snapc = page_snap_context(page);

137

struct ceph_snap_context *snapc = page_snap_context(page);

138

139

inode = page->mapping->host;

139

inode = page->mapping->host;

140

ci = ceph_inode(inode);

140

ci = ceph_inode(inode);

141

142

if (offset != 0 || length != PAGE_CACHE_SIZE) {

142

if (offset != 0 || length != PAGE_CACHE_SIZE) {

143

dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n",

143

dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n",

144

inode, page, page->index, offset, length);

144

inode, page, page->index, offset, length);

145

return;

145

return;

146

}

146

}

147

148

ceph_invalidate_fscache_page(inode, page);

148

ceph_invalidate_fscache_page(inode, page);

149

150

if (!PagePrivate(page))

150

if (!PagePrivate(page))

151

return;

151

return;

152

153

/*

153

/*

154

* We can get non-dirty pages here due to races between

154

* We can get non-dirty pages here due to races between

155

* set_page_dirty and truncate_complete_page; just spit out a

155

* set_page_dirty and truncate_complete_page; just spit out a

156

* warning, in case we end up with accounting problems later.

156

* warning, in case we end up with accounting problems later.

157

*/

157

*/

158

if (!PageDirty(page))

158

if (!PageDirty(page))

159

pr_err("%p invalidatepage %p page not dirty\n", inode, page);

159

pr_err("%p invalidatepage %p page not dirty\n", inode, page);

160

161

ClearPageChecked(page);

161

ClearPageChecked(page);

162

163

dout("%p invalidatepage %p idx %lu full dirty page\n",

163

dout("%p invalidatepage %p idx %lu full dirty page\n",

164

inode, page, page->index);

164

inode, page, page->index);

165

166

ceph_put_wrbuffer_cap_refs(ci, 1, snapc);

166

ceph_put_wrbuffer_cap_refs(ci, 1, snapc);

167

ceph_put_snap_context(snapc);

167

ceph_put_snap_context(snapc);

168

page->private = 0;

168

page->private = 0;

169

ClearPagePrivate(page);

169

ClearPagePrivate(page);

170

}

170

}

171

172

/*
 * Decide whether a page may be released.
 *
 * Returns 0 if fscache refuses to let go of the page; otherwise returns
 * nonzero exactly when the page has no private data attached.
 */
static int ceph_releasepage(struct page *page, gfp_t g)
{
	struct inode *inode = NULL;

	if (page->mapping)
		inode = page->mapping->host;

	dout("%p releasepage %p idx %lu\n", inode, page, page->index);
	WARN_ON(PageDirty(page));

	/* Can we release the page from the cache? */
	if (ceph_release_fscache_page(page, g))
		return !PagePrivate(page);

	return 0;
}

184

185

/*

185

/*

186

* read a single page, without unlocking it.

186

* read a single page, without unlocking it.

187

*/

187

*/

188

static int readpage_nounlock(struct file *filp, struct page *page)

188

static int readpage_nounlock(struct file *filp, struct page *page)

189

{

189

{

190

struct inode *inode = file_inode(filp);

190

struct inode *inode = file_inode(filp);

191

struct ceph_inode_info *ci = ceph_inode(inode);

191

struct ceph_inode_info *ci = ceph_inode(inode);

192

struct ceph_osd_client *osdc =

192

struct ceph_osd_client *osdc =

193

&ceph_inode_to_client(inode)->client->osdc;

193

&ceph_inode_to_client(inode)->client->osdc;

194

int err = 0;

194

int err = 0;

195

u64 len = PAGE_CACHE_SIZE;

195

u64 len = PAGE_CACHE_SIZE;

196

197

err = ceph_readpage_from_fscache(inode, page);

197

err = ceph_readpage_from_fscache(inode, page);

198

199

if (err == 0)

199

if (err == 0)

200

goto out;

200

goto out;

201

202

dout("readpage inode %p file %p page %p index %lu\n",

202

dout("readpage inode %p file %p page %p index %lu\n",

203

inode, filp, page, page->index);

203

inode, filp, page, page->index);

204

err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,

204

err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,

205

(u64) page_offset(page), &len,

205

(u64) page_offset(page), &len,

206

ci->i_truncate_seq, ci->i_truncate_size,

206

ci->i_truncate_seq, ci->i_truncate_size,

207

&page, 1, 0);

207

&page, 1, 0);

208

if (err == -ENOENT)

208

if (err == -ENOENT)

209

err = 0;

209

err = 0;

210

if (err < 0) {

210

if (err < 0) {

211

SetPageError(page);

211

SetPageError(page);

212

goto out;

212

goto out;

213

} else if (err < PAGE_CACHE_SIZE) {

213

} else if (err < PAGE_CACHE_SIZE) {

214

/* zero fill remainder of page */

214

/* zero fill remainder of page */

215

zero_user_segment(page, err, PAGE_CACHE_SIZE);

215

zero_user_segment(page, err, PAGE_CACHE_SIZE);

216

}

216

}

217

SetPageUptodate(page);

217

SetPageUptodate(page);

218

219

if (err == 0)

219

if (err >= 0)

220

ceph_readpage_to_fscache(inode, page);

220

ceph_readpage_to_fscache(inode, page);

221

222

out:

222

out:

223

return err < 0 ? err : 0;

223

return err < 0 ? err : 0;

224

}

224

}

225

226

/*
 * Read a single page via readpage_nounlock(), then unlock it
 * regardless of the outcome.  Returns readpage_nounlock()'s result.
 */
static int ceph_readpage(struct file *filp, struct page *page)
{
	int ret;

	ret = readpage_nounlock(filp, page);
	unlock_page(page);

	return ret;
}

232

233

/*

233

/*

234

* Finish an async read(ahead) op.

234

* Finish an async read(ahead) op.

235

*/

235

*/

236

static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)

236

static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)

237

{

237

{

238

struct inode *inode = req->r_inode;

238

struct inode *inode = req->r_inode;

239

struct ceph_osd_data *osd_data;

239

struct ceph_osd_data *osd_data;

240

int rc = req->r_result;

240

int rc = req->r_result;

241

int bytes = le32_to_cpu(msg->hdr.data_len);

241

int bytes = le32_to_cpu(msg->hdr.data_len);

242

int num_pages;

242

int num_pages;

243

int i;

243

int i;

244

245

dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);

245

dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);

246

247

/* unlock all pages, zeroing any data we didn't read */

247

/* unlock all pages, zeroing any data we didn't read */

248

osd_data = osd_req_op_extent_osd_data(req, 0);

248

osd_data = osd_req_op_extent_osd_data(req, 0);

249

BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);

249

BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);

250

num_pages = calc_pages_for((u64)osd_data->alignment,

250

num_pages = calc_pages_for((u64)osd_data->alignment,

251

(u64)osd_data->length);

251

(u64)osd_data->length);

252

for (i = 0; i < num_pages; i++) {

252

for (i = 0; i < num_pages; i++) {

253

struct page *page = osd_data->pages[i];

253

struct page *page = osd_data->pages[i];

254

255

if (bytes < (int)PAGE_CACHE_SIZE) {

255

if (bytes < (int)PAGE_CACHE_SIZE) {

256

/* zero (remainder of) page */

256

/* zero (remainder of) page */

257

int s = bytes < 0 ? 0 : bytes;

257

int s = bytes < 0 ? 0 : bytes;

258

zero_user_segment(page, s, PAGE_CACHE_SIZE);

258

zero_user_segment(page, s, PAGE_CACHE_SIZE);

259

}

259

}

260

dout("finish_read %p uptodate %p idx %lu\n", inode, page,

260

dout("finish_read %p uptodate %p idx %lu\n", inode, page,

261

page->index);

261

page->index);

262

flush_dcache_page(page);

262

flush_dcache_page(page);

263

SetPageUptodate(page);

263

SetPageUptodate(page);

264

ceph_readpage_to_fscache(inode, page);

264

ceph_readpage_to_fscache(inode, page);

265

unlock_page(page);

265

unlock_page(page);

266

page_cache_release(page);

266

page_cache_release(page);

267

bytes -= PAGE_CACHE_SIZE;

267

bytes -= PAGE_CACHE_SIZE;

268

}

268

}

269

kfree(osd_data->pages);

269

kfree(osd_data->pages);

270

}

270

}

271

272

/*
 * Unlock the first num_pages pages of a page vector.
 */
static void ceph_unlock_page_vector(struct page **pages, int num_pages)
{
	int idx = 0;

	while (idx < num_pages) {
		unlock_page(pages[idx]);
		idx++;
	}
}

279

280

/*

280

/*

281

* start an async read(ahead) operation. return nr_pages we submitted

281

* start an async read(ahead) operation. return nr_pages we submitted

282

* a read for on success, or negative error code.

282

* a read for on success, or negative error code.

283

*/

283

*/

284

static int start_read(struct inode *inode, struct list_head *page_list, int max)

284

static int start_read(struct inode *inode, struct list_head *page_list, int max)

285

{

285

{

286

struct ceph_osd_client *osdc =

286

struct ceph_osd_client *osdc =

287

&ceph_inode_to_client(inode)->client->osdc;

287

&ceph_inode_to_client(inode)->client->osdc;

288

struct ceph_inode_info *ci = ceph_inode(inode);

288

struct ceph_inode_info *ci = ceph_inode(inode);

289

struct page *page = list_entry(page_list->prev, struct page, lru);

289

struct page *page = list_entry(page_list->prev, struct page, lru);

290

struct ceph_vino vino;

290

struct ceph_vino vino;

291

struct ceph_osd_request *req;

291

struct ceph_osd_request *req;

292

u64 off;

292

u64 off;

293

u64 len;

293

u64 len;

294

int i;

294

int i;

295

struct page **pages;

295

struct page **pages;

296

pgoff_t next_index;

296

pgoff_t next_index;

297

int nr_pages = 0;

297

int nr_pages = 0;

298

int ret;

298

int ret;

299

300

off = (u64) page_offset(page);

300

off = (u64) page_offset(page);

301

302

/* count pages */

302

/* count pages */

303

next_index = page->index;

303

next_index = page->index;

304

list_for_each_entry_reverse(page, page_list, lru) {

304

list_for_each_entry_reverse(page, page_list, lru) {

305

if (page->index != next_index)

305

if (page->index != next_index)

306

break;

306

break;

307

nr_pages++;

307

nr_pages++;

308

next_index++;

308

next_index++;

309

if (max && nr_pages == max)

309

if (max && nr_pages == max)

310

break;

310

break;

311

}

311

}

312

len = nr_pages << PAGE_CACHE_SHIFT;

312

len = nr_pages << PAGE_CACHE_SHIFT;

313

dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,

313

dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,

314

off, len);

314

off, len);

315

vino = ceph_vino(inode);

315

vino = ceph_vino(inode);

316

req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len,

316

req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len,

317

1, CEPH_OSD_OP_READ,

317

1, CEPH_OSD_OP_READ,

318

CEPH_OSD_FLAG_READ, NULL,

318

CEPH_OSD_FLAG_READ, NULL,

319

ci->i_truncate_seq, ci->i_truncate_size,

319

ci->i_truncate_seq, ci->i_truncate_size,

320

false);

320

false);

321

if (IS_ERR(req))

321

if (IS_ERR(req))

322

return PTR_ERR(req);

322

return PTR_ERR(req);

323

324

/* build page vector */

324

/* build page vector */

325

nr_pages = calc_pages_for(0, len);

325

nr_pages = calc_pages_for(0, len);

326

pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS);

326

pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS);

327

ret = -ENOMEM;

327

ret = -ENOMEM;

328

if (!pages)

328

if (!pages)

329

goto out;

329

goto out;

330

for (i = 0; i < nr_pages; ++i) {

330

for (i = 0; i < nr_pages; ++i) {

331

page = list_entry(page_list->prev, struct page, lru);

331

page = list_entry(page_list->prev, struct page, lru);

332

BUG_ON(PageLocked(page));

332

BUG_ON(PageLocked(page));

333

list_del(&page->lru);

333

list_del(&page->lru);

334

335

dout("start_read %p adding %p idx %lu\n", inode, page,

335

dout("start_read %p adding %p idx %lu\n", inode, page,

336

page->index);

336

page->index);

337

if (add_to_page_cache_lru(page, &inode->i_data, page->index,

337

if (add_to_page_cache_lru(page, &inode->i_data, page->index,

338

GFP_NOFS)) {

338

GFP_NOFS)) {

339

ceph_fscache_uncache_page(inode, page);

339

ceph_fscache_uncache_page(inode, page);

340

page_cache_release(page);

340

page_cache_release(page);

341

dout("start_read %p add_to_page_cache failed %p\n",

341

dout("start_read %p add_to_page_cache failed %p\n",

342

inode, page);

342

inode, page);

343

nr_pages = i;

343

nr_pages = i;

344

goto out_pages;

344

goto out_pages;

345

}

345

}

346

pages[i] = page;

346

pages[i] = page;

347

}

347

}

348

osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);

348

osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);

349

req->r_callback = finish_read;

349

req->r_callback = finish_read;

350

req->r_inode = inode;

350

req->r_inode = inode;

351

352

ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);

352

ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);

353

354

dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);

354

dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);

355

ret = ceph_osdc_start_request(osdc, req, false);

355

ret = ceph_osdc_start_request(osdc, req, false);

356

if (ret < 0)

356

if (ret < 0)

357

goto out_pages;

357

goto out_pages;

358

ceph_osdc_put_request(req);

358

ceph_osdc_put_request(req);

359

return nr_pages;

359

return nr_pages;

360

361

out_pages:

361

out_pages:

362

ceph_unlock_page_vector(pages, nr_pages);

362

ceph_unlock_page_vector(pages, nr_pages);

363

ceph_release_page_vector(pages, nr_pages);

363

ceph_release_page_vector(pages, nr_pages);

364

out:

364

out:

365

ceph_osdc_put_request(req);

365

ceph_osdc_put_request(req);

366

return ret;

366

return ret;

367

}

367

}

368

369

370

/*

370

/*

371

* Read multiple pages. Leave pages we don't read + unlock in page_list;

371

* Read multiple pages. Leave pages we don't read + unlock in page_list;

372

* the caller (VM) cleans them up.

372

* the caller (VM) cleans them up.

373

*/

373

*/

374

static int ceph_readpages(struct file *file, struct address_space *mapping,

374

static int ceph_readpages(struct file *file, struct address_space *mapping,

375

struct list_head *page_list, unsigned nr_pages)

375

struct list_head *page_list, unsigned nr_pages)

376

{

376

{

377

struct inode *inode = file_inode(file);

377

struct inode *inode = file_inode(file);

378

struct ceph_fs_client *fsc = ceph_inode_to_client(inode);

378

struct ceph_fs_client *fsc = ceph_inode_to_client(inode);

379

int rc = 0;

379

int rc = 0;

380

int max = 0;

380

int max = 0;

381

382

rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list,

382

rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list,

383

&nr_pages);

383

&nr_pages);

384

385

if (rc == 0)

385

if (rc == 0)

386

goto out;

386

goto out;

387

388

if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)

388

if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)

389

max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)

389

max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)

390

>> PAGE_SHIFT;

390

>> PAGE_SHIFT;

391

392

dout("readpages %p file %p nr_pages %d max %d\n", inode,

392

dout("readpages %p file %p nr_pages %d max %d\n", inode,

393

file, nr_pages,

393

file, nr_pages,

394

max);

394

max);

395

while (!list_empty(page_list)) {

395

while (!list_empty(page_list)) {

396

rc = start_read(inode, page_list, max);

396

rc = start_read(inode, page_list, max);

397

if (rc < 0)

397

if (rc < 0)

398

goto out;

398

goto out;

399

BUG_ON(rc == 0);

399

BUG_ON(rc == 0);

400

}

400

}

401

out:

401

out:

402

ceph_fscache_readpages_cancel(inode, page_list);

402

ceph_fscache_readpages_cancel(inode, page_list);

403

404

dout("readpages %p file %p ret %d\n", inode, file, rc);

404

dout("readpages %p file %p ret %d\n", inode, file, rc);

405

return rc;

405

return rc;

406

}

406

}

407

408

/*

408

/*

409

* Get ref for the oldest snapc for an inode with dirty data... that is, the

409

* Get ref for the oldest snapc for an inode with dirty data... that is, the

410

* only snap context we are allowed to write back.

410

* only snap context we are allowed to write back.

411

*/

411

*/

412

static struct ceph_snap_context *get_oldest_context(struct inode *inode,

412

static struct ceph_snap_context *get_oldest_context(struct inode *inode,

413

u64 *snap_size)

413

u64 *snap_size)

414

{

414

{

415

struct ceph_inode_info *ci = ceph_inode(inode);

415

struct ceph_inode_info *ci = ceph_inode(inode);

416

struct ceph_snap_context *snapc = NULL;

416

struct ceph_snap_context *snapc = NULL;

417

struct ceph_cap_snap *capsnap = NULL;

417

struct ceph_cap_snap *capsnap = NULL;

418

419

spin_lock(&ci->i_ceph_lock);

419

spin_lock(&ci->i_ceph_lock);

420

list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {

420

list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {

421

dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,

421

dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,

422

capsnap->context, capsnap->dirty_pages);

422

capsnap->context, capsnap->dirty_pages);

423

if (capsnap->dirty_pages) {

423

if (capsnap->dirty_pages) {

424

snapc = ceph_get_snap_context(capsnap->context);

424

snapc = ceph_get_snap_context(capsnap->context);

425

if (snap_size)

425

if (snap_size)

426

*snap_size = capsnap->size;

426

*snap_size = capsnap->size;

427

break;

427

break;

428

}

428

}

429

}

429

}

430

if (!snapc && ci->i_wrbuffer_ref_head) {

430

if (!snapc && ci->i_wrbuffer_ref_head) {

431

snapc = ceph_get_snap_context(ci->i_head_snapc);

431

snapc = ceph_get_snap_context(ci->i_head_snapc);

432

dout(" head snapc %p has %d dirty pages\n",

432

dout(" head snapc %p has %d dirty pages\n",

433

snapc, ci->i_wrbuffer_ref_head);

433

snapc, ci->i_wrbuffer_ref_head);

434

}

434

}

435

spin_unlock(&ci->i_ceph_lock);

435

spin_unlock(&ci->i_ceph_lock);

436

return snapc;

436

return snapc;

437

}

437

}

438

439

/*

439

/*

440

* Write a single page, but leave the page locked.

440

* Write a single page, but leave the page locked.

441

*

441

*

442

* If we get a write error, set the page error bit, but still adjust the

442

* If we get a write error, set the page error bit, but still adjust the

443

* dirty page accounting (i.e., page is no longer dirty).

443

* dirty page accounting (i.e., page is no longer dirty).

444

*/

444

*/

445

static int writepage_nounlock(struct page *page, struct writeback_control *wbc)

445

static int writepage_nounlock(struct page *page, struct writeback_control *wbc)

446

{

446

{

447

struct inode *inode;

447

struct inode *inode;

448

struct ceph_inode_info *ci;

448

struct ceph_inode_info *ci;

449

struct ceph_fs_client *fsc;

449

struct ceph_fs_client *fsc;

450

struct ceph_osd_client *osdc;

450

struct ceph_osd_client *osdc;

451

struct ceph_snap_context *snapc, *oldest;

451

struct ceph_snap_context *snapc, *oldest;

452

loff_t page_off = page_offset(page);

452

loff_t page_off = page_offset(page);

453

long writeback_stat;

453

long writeback_stat;

454

u64 truncate_size, snap_size = 0;

454

u64 truncate_size, snap_size = 0;

455

u32 truncate_seq;

455

u32 truncate_seq;

456

int err = 0, len = PAGE_CACHE_SIZE;

456

int err = 0, len = PAGE_CACHE_SIZE;

457

458

dout("writepage %p idx %lu\n", page, page->index);

458

dout("writepage %p idx %lu\n", page, page->index);

459

460

if (!page->mapping || !page->mapping->host) {

460

if (!page->mapping || !page->mapping->host) {

461

dout("writepage %p - no mapping\n", page);

461

dout("writepage %p - no mapping\n", page);

462

return -EFAULT;

462

return -EFAULT;

463

}

463

}

464

inode = page->mapping->host;

464

inode = page->mapping->host;

465

ci = ceph_inode(inode);

465

ci = ceph_inode(inode);

466

fsc = ceph_inode_to_client(inode);

466

fsc = ceph_inode_to_client(inode);

467

osdc = &fsc->client->osdc;

467

osdc = &fsc->client->osdc;

468

469

/* verify this is a writeable snap context */

469

/* verify this is a writeable snap context */

470

snapc = page_snap_context(page);

470

snapc = page_snap_context(page);

471

if (snapc == NULL) {

471

if (snapc == NULL) {

472

dout("writepage %p page %p not dirty?\n", inode, page);

472

dout("writepage %p page %p not dirty?\n", inode, page);

473

goto out;

473

goto out;

474

}

474

}

475

oldest = get_oldest_context(inode, &snap_size);

475

oldest = get_oldest_context(inode, &snap_size);

476

if (snapc->seq > oldest->seq) {

476

if (snapc->seq > oldest->seq) {

477

dout("writepage %p page %p snapc %p not writeable - noop\n",

477

dout("writepage %p page %p snapc %p not writeable - noop\n",

478

inode, page, snapc);

478

inode, page, snapc);

479

/* we should only noop if called by kswapd */

479

/* we should only noop if called by kswapd */

480

WARN_ON((current->flags & PF_MEMALLOC) == 0);

480

WARN_ON((current->flags & PF_MEMALLOC) == 0);

481

ceph_put_snap_context(oldest);

481

ceph_put_snap_context(oldest);

482

goto out;

482

goto out;

483

}

483

}

484

ceph_put_snap_context(oldest);

484

ceph_put_snap_context(oldest);

485

486

spin_lock(&ci->i_ceph_lock);

486

spin_lock(&ci->i_ceph_lock);

487

truncate_seq = ci->i_truncate_seq;

487

truncate_seq = ci->i_truncate_seq;

488

truncate_size = ci->i_truncate_size;

488

truncate_size = ci->i_truncate_size;

489

if (!snap_size)

489

if (!snap_size)

490

snap_size = i_size_read(inode);

490

snap_size = i_size_read(inode);

491

spin_unlock(&ci->i_ceph_lock);

491

spin_unlock(&ci->i_ceph_lock);

492

493

/* is this a partial page at end of file? */

493

/* is this a partial page at end of file? */

494

if (page_off >= snap_size) {

494

if (page_off >= snap_size) {

495

dout("%p page eof %llu\n", page, snap_size);

495

dout("%p page eof %llu\n", page, snap_size);

496

goto out;

496

goto out;

497

}

497

}

498

if (snap_size < page_off + len)

498

if (snap_size < page_off + len)

499

len = snap_size - page_off;

499

len = snap_size - page_off;

500

501

dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",

501

dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",

502

inode, page, page->index, page_off, len, snapc);

502

inode, page, page->index, page_off, len, snapc);

503

504

writeback_stat = atomic_long_inc_return(&fsc->writeback_count);

504

writeback_stat = atomic_long_inc_return(&fsc->writeback_count);

505

if (writeback_stat >

505

if (writeback_stat >

506

CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))

506

CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))

507

set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);

507

set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);

508

509

ceph_readpage_to_fscache(inode, page);

509

ceph_readpage_to_fscache(inode, page);

510

511

set_page_writeback(page);

511

set_page_writeback(page);

512

err = ceph_osdc_writepages(osdc, ceph_vino(inode),

512

err = ceph_osdc_writepages(osdc, ceph_vino(inode),

513

&ci->i_layout, snapc,

513

&ci->i_layout, snapc,

514

page_off, len,

514

page_off, len,

515

truncate_seq, truncate_size,

515

truncate_seq, truncate_size,

516

&inode->i_mtime, &page, 1);

516

&inode->i_mtime, &page, 1);

517

if (err < 0) {

517

if (err < 0) {

518

dout("writepage setting page/mapping error %d %p\n", err, page);

518

dout("writepage setting page/mapping error %d %p\n", err, page);

519

SetPageError(page);

519

SetPageError(page);

520

mapping_set_error(&inode->i_data, err);

520

mapping_set_error(&inode->i_data, err);

521

if (wbc)

521

if (wbc)

522

wbc->pages_skipped++;

522

wbc->pages_skipped++;

523

} else {

523

} else {

524

dout("writepage cleaned page %p\n", page);

524

dout("writepage cleaned page %p\n", page);

525

err = 0; /* vfs expects us to return 0 */

525

err = 0; /* vfs expects us to return 0 */

526

}

526

}

527

page->private = 0;

527

page->private = 0;

528

ClearPagePrivate(page);

528

ClearPagePrivate(page);

529

end_page_writeback(page);

529

end_page_writeback(page);

530

ceph_put_wrbuffer_cap_refs(ci, 1, snapc);

530

ceph_put_wrbuffer_cap_refs(ci, 1, snapc);

531

ceph_put_snap_context(snapc); /* page's reference */

531

ceph_put_snap_context(snapc); /* page's reference */

532

out:

532

out:

533

return err;

533

return err;

534

}

534

}

535

536

static int ceph_writepage(struct page *page, struct writeback_control *wbc)

536

static int ceph_writepage(struct page *page, struct writeback_control *wbc)

537

{

537

{

538

int err;

538

int err;

539

struct inode *inode = page->mapping->host;

539

struct inode *inode = page->mapping->host;

540

BUG_ON(!inode);

540

BUG_ON(!inode);

541

ihold(inode);

541

ihold(inode);

542

err = writepage_nounlock(page, wbc);

542

err = writepage_nounlock(page, wbc);

543

unlock_page(page);

543

unlock_page(page);

544

iput(inode);

544

iput(inode);

545

return err;

545

return err;

546

}

546

}

547

548

549

/*

549

/*

550

* lame release_pages helper. release_pages() isn't exported to

550

* lame release_pages helper. release_pages() isn't exported to

551

* modules.

551

* modules.

552

*/

552

*/

553

static void ceph_release_pages(struct page **pages, int num)

553

static void ceph_release_pages(struct page **pages, int num)

554

{

554

{

555

struct pagevec pvec;

555

struct pagevec pvec;

556

int i;

556

int i;

557

558

pagevec_init(&pvec, 0);

558

pagevec_init(&pvec, 0);

559

for (i = 0; i < num; i++) {

559

for (i = 0; i < num; i++) {

560

if (pagevec_add(&pvec, pages[i]) == 0)

560

if (pagevec_add(&pvec, pages[i]) == 0)

561

pagevec_release(&pvec);

561

pagevec_release(&pvec);

562

}

562

}

563

pagevec_release(&pvec);

563

pagevec_release(&pvec);

564

}

564

}

565

566

/*

566

/*

567

* async writeback completion handler.

567

* async writeback completion handler.

568

*

568

*

569

* If we get an error, set the mapping error bit, but not the individual

569

* If we get an error, set the mapping error bit, but not the individual

570

* page error bits.

570

* page error bits.

571

*/

571

*/

572

static void writepages_finish(struct ceph_osd_request *req,

572

static void writepages_finish(struct ceph_osd_request *req,

573

struct ceph_msg *msg)

573

struct ceph_msg *msg)

574

{

574

{

575

struct inode *inode = req->r_inode;

575

struct inode *inode = req->r_inode;

576

struct ceph_inode_info *ci = ceph_inode(inode);

576

struct ceph_inode_info *ci = ceph_inode(inode);

577

struct ceph_osd_data *osd_data;

577

struct ceph_osd_data *osd_data;

578

unsigned wrote;

578

unsigned wrote;

579

struct page *page;

579

struct page *page;

580

int num_pages;

580

int num_pages;

581

int i;

581

int i;

582

struct ceph_snap_context *snapc = req->r_snapc;

582

struct ceph_snap_context *snapc = req->r_snapc;

583

struct address_space *mapping = inode->i_mapping;

583

struct address_space *mapping = inode->i_mapping;

584

int rc = req->r_result;

584

int rc = req->r_result;

585

u64 bytes = req->r_ops[0].extent.length;

585

u64 bytes = req->r_ops[0].extent.length;

586

struct ceph_fs_client *fsc = ceph_inode_to_client(inode);

586

struct ceph_fs_client *fsc = ceph_inode_to_client(inode);

587

long writeback_stat;

587

long writeback_stat;

588

unsigned issued = ceph_caps_issued(ci);

588

unsigned issued = ceph_caps_issued(ci);

589

590

osd_data = osd_req_op_extent_osd_data(req, 0);

590

osd_data = osd_req_op_extent_osd_data(req, 0);

591

BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);

591

BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);

592

num_pages = calc_pages_for((u64)osd_data->alignment,

592

num_pages = calc_pages_for((u64)osd_data->alignment,

593

(u64)osd_data->length);

593

(u64)osd_data->length);

594

if (rc >= 0) {

594

if (rc >= 0) {

595

/*

595

/*

596

* Assume we wrote the pages we originally sent. The

596

* Assume we wrote the pages we originally sent. The

597

* osd might reply with fewer pages if our writeback

597

* osd might reply with fewer pages if our writeback

598

* raced with a truncation and was adjusted at the osd,

598

* raced with a truncation and was adjusted at the osd,

599

* so don't believe the reply.

599

* so don't believe the reply.

600

*/

600

*/

601

wrote = num_pages;

601

wrote = num_pages;

602

} else {

602

} else {

603

wrote = 0;

603

wrote = 0;

604

mapping_set_error(mapping, rc);

604

mapping_set_error(mapping, rc);

605

}

605

}

606

dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",

606

dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",

607

inode, rc, bytes, wrote);

607

inode, rc, bytes, wrote);

608

609

/* clean all pages */

609

/* clean all pages */

610

for (i = 0; i < num_pages; i++) {

610

for (i = 0; i < num_pages; i++) {

611

page = osd_data->pages[i];

611

page = osd_data->pages[i];

612

BUG_ON(!page);

612

BUG_ON(!page);

613

WARN_ON(!PageUptodate(page));

613

WARN_ON(!PageUptodate(page));

614

615

writeback_stat =

615

writeback_stat =

616

atomic_long_dec_return(&fsc->writeback_count);

616

atomic_long_dec_return(&fsc->writeback_count);

617

if (writeback_stat <

617

if (writeback_stat <

618

CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))

618

CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))

619

clear_bdi_congested(&fsc->backing_dev_info,

619

clear_bdi_congested(&fsc->backing_dev_info,

620

BLK_RW_ASYNC);

620

BLK_RW_ASYNC);

621

622

ceph_put_snap_context(page_snap_context(page));

622

ceph_put_snap_context(page_snap_context(page));

623

page->private = 0;

623

page->private = 0;

624

ClearPagePrivate(page);

624

ClearPagePrivate(page);

625

dout("unlocking %d %p\n", i, page);

625

dout("unlocking %d %p\n", i, page);

626

end_page_writeback(page);

626

end_page_writeback(page);

627

628

/*

628

/*

629

* We lost the cache cap, need to truncate the page before

629

* We lost the cache cap, need to truncate the page before

630

* it is unlocked, otherwise we'd truncate it later in the

630

* it is unlocked, otherwise we'd truncate it later in the

631

* page truncation thread, possibly losing some data that

631

* page truncation thread, possibly losing some data that

632

* raced its way in

632

* raced its way in

633

*/

633

*/

634

if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)

634

if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)

635

generic_error_remove_page(inode->i_mapping, page);

635

generic_error_remove_page(inode->i_mapping, page);

636

637

unlock_page(page);

637

unlock_page(page);

638

}

638

}

639

dout("%p wrote+cleaned %d pages\n", inode, wrote);

639

dout("%p wrote+cleaned %d pages\n", inode, wrote);

640

ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc);

640

ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc);

641

642

ceph_release_pages(osd_data->pages, num_pages);

642

ceph_release_pages(osd_data->pages, num_pages);

643

if (osd_data->pages_from_pool)

643

if (osd_data->pages_from_pool)

644

mempool_free(osd_data->pages,

644

mempool_free(osd_data->pages,

645

ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);

645

ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);

646

else

646

else

647

kfree(osd_data->pages);

647

kfree(osd_data->pages);

648

ceph_osdc_put_request(req);

648

ceph_osdc_put_request(req);

649

}

649

}

650

651

/*

651

/*

652

* initiate async writeback

652

* initiate async writeback

653

*/

653

*/

654

static int ceph_writepages_start(struct address_space *mapping,

654

static int ceph_writepages_start(struct address_space *mapping,

655

struct writeback_control *wbc)

655

struct writeback_control *wbc)

656

{

656

{

657

struct inode *inode = mapping->host;

657

struct inode *inode = mapping->host;

658

struct ceph_inode_info *ci = ceph_inode(inode);

658

struct ceph_inode_info *ci = ceph_inode(inode);

659

struct ceph_fs_client *fsc = ceph_inode_to_client(inode);

659

struct ceph_fs_client *fsc = ceph_inode_to_client(inode);

660

struct ceph_vino vino = ceph_vino(inode);

660

struct ceph_vino vino = ceph_vino(inode);

661

pgoff_t index, start, end;

661

pgoff_t index, start, end;

662

int range_whole = 0;

662

int range_whole = 0;

663

int should_loop = 1;

663

int should_loop = 1;

664

pgoff_t max_pages = 0, max_pages_ever = 0;

664

pgoff_t max_pages = 0, max_pages_ever = 0;

665

struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;

665

struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;

666

struct pagevec pvec;

666

struct pagevec pvec;

667

int done = 0;

667

int done = 0;

668

int rc = 0;

668

int rc = 0;

669

unsigned wsize = 1 << inode->i_blkbits;

669

unsigned wsize = 1 << inode->i_blkbits;

670

struct ceph_osd_request *req = NULL;

670

struct ceph_osd_request *req = NULL;

671

int do_sync;

671

int do_sync;

672

u64 truncate_size, snap_size;

672

u64 truncate_size, snap_size;

673

u32 truncate_seq;

673

u32 truncate_seq;

674

675

/*

675

/*

676

* Include a 'sync' in the OSD request if this is a data

676

* Include a 'sync' in the OSD request if this is a data

677

* integrity write (e.g., O_SYNC write or fsync()), or if our

677

* integrity write (e.g., O_SYNC write or fsync()), or if our

678

* cap is being revoked.

678

* cap is being revoked.

679

*/

679

*/

680

if ((wbc->sync_mode == WB_SYNC_ALL) ||

680

if ((wbc->sync_mode == WB_SYNC_ALL) ||

681

ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))

681

ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))

682

do_sync = 1;

682

do_sync = 1;

683

dout("writepages_start %p dosync=%d (mode=%s)\n",

683

dout("writepages_start %p dosync=%d (mode=%s)\n",

684

inode, do_sync,

684

inode, do_sync,

685

wbc->sync_mode == WB_SYNC_NONE ? "NONE" :

685

wbc->sync_mode == WB_SYNC_NONE ? "NONE" :

686

(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));

686

(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));

687

688

if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {

688

if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {

689

pr_warning("writepage_start %p on forced umount\n", inode);

689

pr_warning("writepage_start %p on forced umount\n", inode);

690

return -EIO; /* we're in a forced umount, don't write! */

690

return -EIO; /* we're in a forced umount, don't write! */

691

}

691

}

692

if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)

692

if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)

693

wsize = fsc->mount_options->wsize;

693

wsize = fsc->mount_options->wsize;

694

if (wsize < PAGE_CACHE_SIZE)

694

if (wsize < PAGE_CACHE_SIZE)

695

wsize = PAGE_CACHE_SIZE;

695

wsize = PAGE_CACHE_SIZE;

696

max_pages_ever = wsize >> PAGE_CACHE_SHIFT;

696

max_pages_ever = wsize >> PAGE_CACHE_SHIFT;

697

698

pagevec_init(&pvec, 0);

698

pagevec_init(&pvec, 0);

699

700

/* where to start/end? */

700

/* where to start/end? */

701

if (wbc->range_cyclic) {

701

if (wbc->range_cyclic) {

702

start = mapping->writeback_index; /* Start from prev offset */

702

start = mapping->writeback_index; /* Start from prev offset */

703

end = -1;

703

end = -1;

704

dout(" cyclic, start at %lu\n", start);

704

dout(" cyclic, start at %lu\n", start);

705

} else {

705

} else {

706

start = wbc->range_start >> PAGE_CACHE_SHIFT;

706

start = wbc->range_start >> PAGE_CACHE_SHIFT;

707

end = wbc->range_end >> PAGE_CACHE_SHIFT;

707

end = wbc->range_end >> PAGE_CACHE_SHIFT;

708

if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)

708

if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)

709

range_whole = 1;

709

range_whole = 1;

710

should_loop = 0;

710

should_loop = 0;

711

dout(" not cyclic, %lu to %lu\n", start, end);

711

dout(" not cyclic, %lu to %lu\n", start, end);

712

}

712

}

713

index = start;

713

index = start;

714

715

retry:

715

retry:

716

/* find oldest snap context with dirty data */

716

/* find oldest snap context with dirty data */

717

ceph_put_snap_context(snapc);

717

ceph_put_snap_context(snapc);

718

snap_size = 0;

718

snap_size = 0;

719

snapc = get_oldest_context(inode, &snap_size);

719

snapc = get_oldest_context(inode, &snap_size);

720

if (!snapc) {

720

if (!snapc) {

721

/* hmm, why does writepages get called when there

721

/* hmm, why does writepages get called when there

722

is no dirty data? */

722

is no dirty data? */

723

dout(" no snap context with dirty data?\n");

723

dout(" no snap context with dirty data?\n");

724

goto out;

724

goto out;

725

}

725

}

726

if (snap_size == 0)

726

if (snap_size == 0)

727

snap_size = i_size_read(inode);

727

snap_size = i_size_read(inode);

728

dout(" oldest snapc is %p seq %lld (%d snaps)\n",

728

dout(" oldest snapc is %p seq %lld (%d snaps)\n",

729

snapc, snapc->seq, snapc->num_snaps);

729

snapc, snapc->seq, snapc->num_snaps);

730

731

spin_lock(&ci->i_ceph_lock);

731

spin_lock(&ci->i_ceph_lock);

732

truncate_seq = ci->i_truncate_seq;

732

truncate_seq = ci->i_truncate_seq;

733

truncate_size = ci->i_truncate_size;

733

truncate_size = ci->i_truncate_size;

734

if (!snap_size)

734

if (!snap_size)

735

snap_size = i_size_read(inode);

735

snap_size = i_size_read(inode);

736

spin_unlock(&ci->i_ceph_lock);

736

spin_unlock(&ci->i_ceph_lock);

737

738

if (last_snapc && snapc != last_snapc) {

738

if (last_snapc && snapc != last_snapc) {

739

/* if we switched to a newer snapc, restart our scan at the

739

/* if we switched to a newer snapc, restart our scan at the

740

* start of the original file range. */

740

* start of the original file range. */

741

dout(" snapc differs from last pass, restarting at %lu\n",

741

dout(" snapc differs from last pass, restarting at %lu\n",

742

index);

742

index);

743

index = start;

743

index = start;

744

}

744

}

745

last_snapc = snapc;

745

last_snapc = snapc;

746

747

while (!done && index <= end) {

747

while (!done && index <= end) {

748

int num_ops = do_sync ? 2 : 1;

748

int num_ops = do_sync ? 2 : 1;

749

unsigned i;

749

unsigned i;

750

int first;

750

int first;

751

pgoff_t next;

751

pgoff_t next;

752

int pvec_pages, locked_pages;

752

int pvec_pages, locked_pages;

753

struct page **pages = NULL;

753

struct page **pages = NULL;

754

mempool_t *pool = NULL; /* Becomes non-null if mempool used */

754

mempool_t *pool = NULL; /* Becomes non-null if mempool used */

755

struct page *page;

755

struct page *page;

756

int want;

756

int want;

757

u64 offset, len;

757

u64 offset, len;

758

long writeback_stat;

758

long writeback_stat;

759

760

next = 0;

760

next = 0;

761

locked_pages = 0;

761

locked_pages = 0;

762

max_pages = max_pages_ever;

762

max_pages = max_pages_ever;

763

764

get_more_pages:

764

get_more_pages:

765

first = -1;

765

first = -1;

766

want = min(end - index,

766

want = min(end - index,

767

min((pgoff_t)PAGEVEC_SIZE,

767

min((pgoff_t)PAGEVEC_SIZE,

768

max_pages - (pgoff_t)locked_pages) - 1)

768

max_pages - (pgoff_t)locked_pages) - 1)

769

+ 1;

769

+ 1;

770

pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,

770

pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,

771

PAGECACHE_TAG_DIRTY,

771

PAGECACHE_TAG_DIRTY,

772

want);

772

want);

773

dout("pagevec_lookup_tag got %d\n", pvec_pages);

773

dout("pagevec_lookup_tag got %d\n", pvec_pages);

774

if (!pvec_pages && !locked_pages)

774

if (!pvec_pages && !locked_pages)

775

break;

775

break;

776

for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {

776

for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {

777

page = pvec.pages[i];

777

page = pvec.pages[i];

778

dout("? %p idx %lu\n", page, page->index);

778

dout("? %p idx %lu\n", page, page->index);

779

if (locked_pages == 0)

779

if (locked_pages == 0)

780

lock_page(page); /* first page */

780

lock_page(page); /* first page */

781

else if (!trylock_page(page))

781

else if (!trylock_page(page))

782

break;

782

break;

783

784

/* only dirty pages, or our accounting breaks */

784

/* only dirty pages, or our accounting breaks */

785

if (unlikely(!PageDirty(page)) ||

785

if (unlikely(!PageDirty(page)) ||

786

unlikely(page->mapping != mapping)) {

786

unlikely(page->mapping != mapping)) {

787

dout("!dirty or !mapping %p\n", page);

787

dout("!dirty or !mapping %p\n", page);

788

unlock_page(page);

788

unlock_page(page);

789

break;

789

break;

790

}

790

}

791

if (!wbc->range_cyclic && page->index > end) {

791

if (!wbc->range_cyclic && page->index > end) {

792

dout("end of range %p\n", page);

792

dout("end of range %p\n", page);

793

done = 1;

793

done = 1;

794

unlock_page(page);

794

unlock_page(page);

795

break;

795

break;

796

}

796

}

797

if (next && (page->index != next)) {

797

if (next && (page->index != next)) {

798

dout("not consecutive %p\n", page);

798

dout("not consecutive %p\n", page);

799

unlock_page(page);

799

unlock_page(page);

800

break;

800

break;

801

}

801

}

802

if (wbc->sync_mode != WB_SYNC_NONE) {

802

if (wbc->sync_mode != WB_SYNC_NONE) {

803

dout("waiting on writeback %p\n", page);

803

dout("waiting on writeback %p\n", page);

804

wait_on_page_writeback(page);

804

wait_on_page_writeback(page);

805

}

805

}

806

if (page_offset(page) >= snap_size) {

806

if (page_offset(page) >= snap_size) {

807

dout("%p page eof %llu\n", page, snap_size);

807

dout("%p page eof %llu\n", page, snap_size);

808

done = 1;

808

done = 1;

809

unlock_page(page);

809

unlock_page(page);

810

break;

810

break;

811

}

811

}

812

if (PageWriteback(page)) {

812

if (PageWriteback(page)) {

813

dout("%p under writeback\n", page);

813

dout("%p under writeback\n", page);

814

unlock_page(page);

814

unlock_page(page);

815

break;

815

break;

816

}

816

}

817

818

/* only if matching snap context */

818

/* only if matching snap context */

819

pgsnapc = page_snap_context(page);

819

pgsnapc = page_snap_context(page);

820

if (pgsnapc->seq > snapc->seq) {

820

if (pgsnapc->seq > snapc->seq) {

821

dout("page snapc %p %lld > oldest %p %lld\n",

821

dout("page snapc %p %lld > oldest %p %lld\n",

822

pgsnapc, pgsnapc->seq, snapc, snapc->seq);

822

pgsnapc, pgsnapc->seq, snapc, snapc->seq);

823

unlock_page(page);

823

unlock_page(page);

824

if (!locked_pages)

824

if (!locked_pages)

825

continue; /* keep looking for snap */

825

continue; /* keep looking for snap */

826

break;

826

break;

827

}

827

}

828

829

if (!clear_page_dirty_for_io(page)) {

829

if (!clear_page_dirty_for_io(page)) {

830

dout("%p !clear_page_dirty_for_io\n", page);

830

dout("%p !clear_page_dirty_for_io\n", page);

831

unlock_page(page);

831

unlock_page(page);

832

break;

832

break;

833

}

833

}

834

835

/*

835

/*

836

* We have something to write. If this is

836

* We have something to write. If this is

837

* the first locked page this time through,

837

* the first locked page this time through,

838

* allocate an osd request and a page array

838

* allocate an osd request and a page array

839

* that it will use.

839

* that it will use.

840

*/

840

*/

841

if (locked_pages == 0) {

841

if (locked_pages == 0) {

842

BUG_ON(pages);

842

BUG_ON(pages);

843

/* prepare async write request */

843

/* prepare async write request */

844

offset = (u64)page_offset(page);

844

offset = (u64)page_offset(page);

845

len = wsize;

845

len = wsize;

846

req = ceph_osdc_new_request(&fsc->client->osdc,

846

req = ceph_osdc_new_request(&fsc->client->osdc,

847

&ci->i_layout, vino,

847

&ci->i_layout, vino,

848

offset, &len, num_ops,

848

offset, &len, num_ops,

849

CEPH_OSD_OP_WRITE,

849

CEPH_OSD_OP_WRITE,

850

CEPH_OSD_FLAG_WRITE |

850

CEPH_OSD_FLAG_WRITE |

851

CEPH_OSD_FLAG_ONDISK,

851

CEPH_OSD_FLAG_ONDISK,

852

snapc, truncate_seq,

852

snapc, truncate_seq,

853

truncate_size, true);

853

truncate_size, true);

854

if (IS_ERR(req)) {

854

if (IS_ERR(req)) {

855

rc = PTR_ERR(req);

855

rc = PTR_ERR(req);

856

unlock_page(page);

856

unlock_page(page);

857

break;

857

break;

858

}

858

}

859

860

req->r_callback = writepages_finish;

860

req->r_callback = writepages_finish;

861

req->r_inode = inode;

861

req->r_inode = inode;

862

863

max_pages = calc_pages_for(0, (u64)len);

863

max_pages = calc_pages_for(0, (u64)len);

864

pages = kmalloc(max_pages * sizeof (*pages),

864

pages = kmalloc(max_pages * sizeof (*pages),

865

GFP_NOFS);

865

GFP_NOFS);

866

if (!pages) {

866

if (!pages) {

867

pool = fsc->wb_pagevec_pool;

867

pool = fsc->wb_pagevec_pool;

868

pages = mempool_alloc(pool, GFP_NOFS);

868

pages = mempool_alloc(pool, GFP_NOFS);

869

BUG_ON(!pages);

869

BUG_ON(!pages);

870

}

870

}

871

}

871

}

872

873

/* note position of first page in pvec */

873

/* note position of first page in pvec */

874

if (first < 0)

874

if (first < 0)

875

first = i;

875

first = i;

876

dout("%p will write page %p idx %lu\n",

876

dout("%p will write page %p idx %lu\n",

877

inode, page, page->index);

877

inode, page, page->index);

878

879

writeback_stat =

879

writeback_stat =

880

atomic_long_inc_return(&fsc->writeback_count);

880

atomic_long_inc_return(&fsc->writeback_count);

881

if (writeback_stat > CONGESTION_ON_THRESH(

881

if (writeback_stat > CONGESTION_ON_THRESH(

882

fsc->mount_options->congestion_kb)) {

882

fsc->mount_options->congestion_kb)) {

883

set_bdi_congested(&fsc->backing_dev_info,

883

set_bdi_congested(&fsc->backing_dev_info,

884

BLK_RW_ASYNC);

884

BLK_RW_ASYNC);

885

}

885

}

886

887

set_page_writeback(page);

887

set_page_writeback(page);

888

pages[locked_pages] = page;

888

pages[locked_pages] = page;

889

locked_pages++;

889

locked_pages++;

890

next = page->index + 1;

890

next = page->index + 1;

891

}

891

}

892

893

/* did we get anything? */

893

/* did we get anything? */

894

if (!locked_pages)

894

if (!locked_pages)

895

goto release_pvec_pages;

895

goto release_pvec_pages;

896

if (i) {

896

if (i) {

897

int j;

897

int j;

898

BUG_ON(!locked_pages || first < 0);

898

BUG_ON(!locked_pages || first < 0);

899

900

if (pvec_pages && i == pvec_pages &&

900

if (pvec_pages && i == pvec_pages &&

901

locked_pages < max_pages) {

901

locked_pages < max_pages) {

902

dout("reached end pvec, trying for more\n");

902

dout("reached end pvec, trying for more\n");

903

pagevec_reinit(&pvec);

903

pagevec_reinit(&pvec);

904

goto get_more_pages;

904

goto get_more_pages;

905

}

905

}

906

907

/* shift unused pages over in the pvec... we

907

/* shift unused pages over in the pvec... we

908

* will need to release them below. */

908

* will need to release them below. */

909

for (j = i; j < pvec_pages; j++) {

909

for (j = i; j < pvec_pages; j++) {

910

dout(" pvec leftover page %p\n",

910

dout(" pvec leftover page %p\n",

911

pvec.pages[j]);

911

pvec.pages[j]);

912

pvec.pages[j-i+first] = pvec.pages[j];

912

pvec.pages[j-i+first] = pvec.pages[j];

913

}

913

}

914

pvec.nr -= i-first;

914

pvec.nr -= i-first;

915

}

915

}

916

917

/* Format the osd request message and submit the write */

917

/* Format the osd request message and submit the write */

918

919

offset = page_offset(pages[0]);

919

offset = page_offset(pages[0]);

920

len = min(snap_size - offset,

920

len = min(snap_size - offset,

921

(u64)locked_pages << PAGE_CACHE_SHIFT);

921

(u64)locked_pages << PAGE_CACHE_SHIFT);

922

dout("writepages got %d pages at %llu~%llu\n",

922

dout("writepages got %d pages at %llu~%llu\n",

923

locked_pages, offset, len);

923

locked_pages, offset, len);

924

925

osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,

925

osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,

926

!!pool, false);

926

!!pool, false);

927

928

pages = NULL; /* request message now owns the pages array */

928

pages = NULL; /* request message now owns the pages array */

929

pool = NULL;

929

pool = NULL;

930

931

/* Update the write op length in case we changed it */

931

/* Update the write op length in case we changed it */

932

933

osd_req_op_extent_update(req, 0, len);

933

osd_req_op_extent_update(req, 0, len);

934

935

vino = ceph_vino(inode);

935

vino = ceph_vino(inode);

936

ceph_osdc_build_request(req, offset, snapc, vino.snap,

936

ceph_osdc_build_request(req, offset, snapc, vino.snap,

937

&inode->i_mtime);

937

&inode->i_mtime);

938

939

rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);

939

rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);

940

BUG_ON(rc);

940

BUG_ON(rc);

941

req = NULL;

941

req = NULL;

942

943

/* continue? */

943

/* continue? */

944

index = next;

944

index = next;

945

wbc->nr_to_write -= locked_pages;

945

wbc->nr_to_write -= locked_pages;

946

if (wbc->nr_to_write <= 0)

946

if (wbc->nr_to_write <= 0)

947

done = 1;

947

done = 1;

948

949

release_pvec_pages:

949

release_pvec_pages:

950

dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,

950

dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,

951

pvec.nr ? pvec.pages[0] : NULL);

951

pvec.nr ? pvec.pages[0] : NULL);

952

pagevec_release(&pvec);

952

pagevec_release(&pvec);

953

954

if (locked_pages && !done)

954

if (locked_pages && !done)

955

goto retry;

955

goto retry;

956

}

956

}

957

958

if (should_loop && !done) {

958

if (should_loop && !done) {

959

/* more to do; loop back to beginning of file */

959

/* more to do; loop back to beginning of file */

960

dout("writepages looping back to beginning of file\n");

960

dout("writepages looping back to beginning of file\n");

961

should_loop = 0;

961

should_loop = 0;

962

index = 0;

962

index = 0;

963

goto retry;

963

goto retry;

964

}

964

}

965

966

if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))

966

if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))

967

mapping->writeback_index = index;

967

mapping->writeback_index = index;

968

969

out:

969

out:

970

if (req)

970

if (req)

971

ceph_osdc_put_request(req);

971

ceph_osdc_put_request(req);

972

ceph_put_snap_context(snapc);

972

ceph_put_snap_context(snapc);

973

dout("writepages done, rc = %d\n", rc);

973

dout("writepages done, rc = %d\n", rc);

974

return rc;

974

return rc;

975

}

975

}

976

977

978

979

/*

979

/*

980

* See if a given @snapc is either writeable, or already written.

980

* See if a given @snapc is either writeable, or already written.

981

*/

981

*/

982

static int context_is_writeable_or_written(struct inode *inode,

982

static int context_is_writeable_or_written(struct inode *inode,

983

struct ceph_snap_context *snapc)

983

struct ceph_snap_context *snapc)

984

{

984

{

985

struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);

985

struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);

986

int ret = !oldest || snapc->seq <= oldest->seq;

986

int ret = !oldest || snapc->seq <= oldest->seq;

987

988

ceph_put_snap_context(oldest);

988

ceph_put_snap_context(oldest);

989

return ret;

989

return ret;

990

}

990

}

991

992

/*

992

/*

993

* We are only allowed to write into/dirty the page if the page is

993

* We are only allowed to write into/dirty the page if the page is

994

* clean, or already dirty within the same snap context.

994

* clean, or already dirty within the same snap context.

995

*

995

*

996

* called with page locked.

996

* called with page locked.

997

* return success with page locked,

997

* return success with page locked,

998

* or any failure (incl -EAGAIN) with page unlocked.

998

* or any failure (incl -EAGAIN) with page unlocked.

999

*/

999

*/

1000

static int ceph_update_writeable_page(struct file *file,

1000

static int ceph_update_writeable_page(struct file *file,

1001

loff_t pos, unsigned len,

1001

loff_t pos, unsigned len,

1002

struct page *page)

1002

struct page *page)

1003

{

1003

{

1004

struct inode *inode = file_inode(file);

1004

struct inode *inode = file_inode(file);

1005

struct ceph_inode_info *ci = ceph_inode(inode);

1005

struct ceph_inode_info *ci = ceph_inode(inode);

1006

struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;

1006

struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;

1007

loff_t page_off = pos & PAGE_CACHE_MASK;

1007

loff_t page_off = pos & PAGE_CACHE_MASK;

1008

int pos_in_page = pos & ~PAGE_CACHE_MASK;

1008

int pos_in_page = pos & ~PAGE_CACHE_MASK;

1009

int end_in_page = pos_in_page + len;

1009

int end_in_page = pos_in_page + len;

1010

loff_t i_size;

1010

loff_t i_size;

1011

int r;

1011

int r;

1012

struct ceph_snap_context *snapc, *oldest;

1012

struct ceph_snap_context *snapc, *oldest;

1013

1014

retry_locked:

1014

retry_locked:

1015

/* writepages currently holds page lock, but if we change that later, */

1015

/* writepages currently holds page lock, but if we change that later, */

1016

wait_on_page_writeback(page);

1016

wait_on_page_writeback(page);

1017

1018

/* check snap context */

1018

/* check snap context */

1019

BUG_ON(!ci->i_snap_realm);

1019

BUG_ON(!ci->i_snap_realm);

1020

down_read(&mdsc->snap_rwsem);

1020

down_read(&mdsc->snap_rwsem);

1021

BUG_ON(!ci->i_snap_realm->cached_context);

1021

BUG_ON(!ci->i_snap_realm->cached_context);

1022

snapc = page_snap_context(page);

1022

snapc = page_snap_context(page);

1023

if (snapc && snapc != ci->i_head_snapc) {

1023

if (snapc && snapc != ci->i_head_snapc) {

1024

/*

1024

/*

1025

* this page is already dirty in another (older) snap

1025

* this page is already dirty in another (older) snap

1026

* context! is it writeable now?

1026

* context! is it writeable now?

1027

*/

1027

*/

1028

oldest = get_oldest_context(inode, NULL);

1028

oldest = get_oldest_context(inode, NULL);

1029

up_read(&mdsc->snap_rwsem);

1029

up_read(&mdsc->snap_rwsem);

1030

1031

if (snapc->seq > oldest->seq) {

1031

if (snapc->seq > oldest->seq) {

1032

ceph_put_snap_context(oldest);

1032

ceph_put_snap_context(oldest);

1033

dout(" page %p snapc %p not current or oldest\n",

1033

dout(" page %p snapc %p not current or oldest\n",

1034

page, snapc);

1034

page, snapc);

1035

/*

1035

/*

1036

* queue for writeback, and wait for snapc to

1036

* queue for writeback, and wait for snapc to

1037

* be writeable or written

1037

* be writeable or written

1038

*/

1038

*/

1039

snapc = ceph_get_snap_context(snapc);

1039

snapc = ceph_get_snap_context(snapc);

1040

unlock_page(page);

1040

unlock_page(page);

1041

ceph_queue_writeback(inode);

1041

ceph_queue_writeback(inode);

1042

r = wait_event_interruptible(ci->i_cap_wq,

1042

r = wait_event_interruptible(ci->i_cap_wq,

1043

context_is_writeable_or_written(inode, snapc));

1043

context_is_writeable_or_written(inode, snapc));

1044

ceph_put_snap_context(snapc);

1044

ceph_put_snap_context(snapc);

1045

if (r == -ERESTARTSYS)

1045

if (r == -ERESTARTSYS)

1046

return r;

1046

return r;

1047

return -EAGAIN;

1047

return -EAGAIN;

1048

}

1048

}

1049

ceph_put_snap_context(oldest);

1049

ceph_put_snap_context(oldest);

1050

1051

/* yay, writeable, do it now (without dropping page lock) */

1051

/* yay, writeable, do it now (without dropping page lock) */

1052

dout(" page %p snapc %p not current, but oldest\n",

1052

dout(" page %p snapc %p not current, but oldest\n",

1053

page, snapc);

1053

page, snapc);

1054

if (!clear_page_dirty_for_io(page))

1054

if (!clear_page_dirty_for_io(page))

1055

goto retry_locked;

1055

goto retry_locked;

1056

r = writepage_nounlock(page, NULL);

1056

r = writepage_nounlock(page, NULL);

1057

if (r < 0)

1057

if (r < 0)

1058

goto fail_nosnap;

1058

goto fail_nosnap;

1059

goto retry_locked;

1059

goto retry_locked;

1060

}

1060

}

1061

1062

if (PageUptodate(page)) {

1062

if (PageUptodate(page)) {

1063

dout(" page %p already uptodate\n", page);

1063

dout(" page %p already uptodate\n", page);

1064

return 0;

1064

return 0;

1065

}

1065

}

1066

1067

/* full page? */

1067

/* full page? */

1068

if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)

1068

if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)

1069

return 0;

1069

return 0;

1070

1071

/* past end of file? */

1071

/* past end of file? */

1072

i_size = inode->i_size; /* caller holds i_mutex */

1072

i_size = inode->i_size; /* caller holds i_mutex */

1073

1074

if (i_size + len > inode->i_sb->s_maxbytes) {

1074

if (i_size + len > inode->i_sb->s_maxbytes) {

1075

/* file is too big */

1075

/* file is too big */

1076

r = -EINVAL;

1076

r = -EINVAL;

1077

goto fail;

1077

goto fail;

1078

}

1078

}

1079

1080

if (page_off >= i_size ||

1080

if (page_off >= i_size ||

1081

(pos_in_page == 0 && (pos+len) >= i_size &&

1081

(pos_in_page == 0 && (pos+len) >= i_size &&

1082

end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {

1082

end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {

1083

dout(" zeroing %p 0 - %d and %d - %d\n",

1083

dout(" zeroing %p 0 - %d and %d - %d\n",

1084

page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);

1084

page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);

1085

zero_user_segments(page,

1085

zero_user_segments(page,

1086

0, pos_in_page,

1086

0, pos_in_page,

1087

end_in_page, PAGE_CACHE_SIZE);

1087

end_in_page, PAGE_CACHE_SIZE);

1088

return 0;

1088

return 0;

1089

}

1089

}

1090

1091

/* we need to read it. */

1091

/* we need to read it. */

1092

up_read(&mdsc->snap_rwsem);

1092

up_read(&mdsc->snap_rwsem);

1093

r = readpage_nounlock(file, page);

1093

r = readpage_nounlock(file, page);

1094

if (r < 0)

1094

if (r < 0)

1095

goto fail_nosnap;

1095

goto fail_nosnap;

1096

goto retry_locked;

1096

goto retry_locked;

1097

1098

fail:

1098

fail:

1099

up_read(&mdsc->snap_rwsem);

1099

up_read(&mdsc->snap_rwsem);

1100

fail_nosnap:

1100

fail_nosnap:

1101

unlock_page(page);

1101

unlock_page(page);

1102

return r;

1102

return r;

1103

}

1103

}

1104

1105

/*

1105

/*

1106

* We are only allowed to write into/dirty the page if the page is

1106

* We are only allowed to write into/dirty the page if the page is

1107

* clean, or already dirty within the same snap context.

1107

* clean, or already dirty within the same snap context.

1108

*/

1108

*/

1109

static int ceph_write_begin(struct file *file, struct address_space *mapping,

1109

static int ceph_write_begin(struct file *file, struct address_space *mapping,

1110

loff_t pos, unsigned len, unsigned flags,

1110

loff_t pos, unsigned len, unsigned flags,

1111

struct page **pagep, void **fsdata)

1111

struct page **pagep, void **fsdata)

1112

{

1112

{

1113

struct inode *inode = file_inode(file);

1113

struct inode *inode = file_inode(file);

1114

struct page *page;

1114

struct page *page;

1115

pgoff_t index = pos >> PAGE_CACHE_SHIFT;

1115

pgoff_t index = pos >> PAGE_CACHE_SHIFT;

1116

int r;

1116

int r;

1117

1118

do {

1118

do {

1119

/* get a page */

1119

/* get a page */

1120

page = grab_cache_page_write_begin(mapping, index, 0);

1120

page = grab_cache_page_write_begin(mapping, index, 0);

1121

if (!page)

1121

if (!page)

1122

return -ENOMEM;

1122

return -ENOMEM;

1123

*pagep = page;

1123

*pagep = page;

1124

1125

dout("write_begin file %p inode %p page %p %d~%d\n", file,

1125

dout("write_begin file %p inode %p page %p %d~%d\n", file,

1126

inode, page, (int)pos, (int)len);

1126

inode, page, (int)pos, (int)len);

1127

1128

r = ceph_update_writeable_page(file, pos, len, page);

1128

r = ceph_update_writeable_page(file, pos, len, page);

1129

} while (r == -EAGAIN);

1129

} while (r == -EAGAIN);

1130

1131

return r;

1131

return r;

1132

}

1132

}

1133

1134

/*

1134

/*

1135

* we don't do anything in here that simple_write_end doesn't do

1135

* we don't do anything in here that simple_write_end doesn't do

1136

* except adjust dirty page accounting and drop read lock on

1136

* except adjust dirty page accounting and drop read lock on

1137

* mdsc->snap_rwsem.

1137

* mdsc->snap_rwsem.

1138

*/

1138

*/

1139

static int ceph_write_end(struct file *file, struct address_space *mapping,

1139

static int ceph_write_end(struct file *file, struct address_space *mapping,

1140

loff_t pos, unsigned len, unsigned copied,

1140

loff_t pos, unsigned len, unsigned copied,

1141

struct page *page, void *fsdata)

1141

struct page *page, void *fsdata)

1142

{

1142

{

1143

struct inode *inode = file_inode(file);

1143

struct inode *inode = file_inode(file);

1144

struct ceph_fs_client *fsc = ceph_inode_to_client(inode);

1144

struct ceph_fs_client *fsc = ceph_inode_to_client(inode);

1145

struct ceph_mds_client *mdsc = fsc->mdsc;

1145

struct ceph_mds_client *mdsc = fsc->mdsc;

1146

unsigned from = pos & (PAGE_CACHE_SIZE - 1);

1146

unsigned from = pos & (PAGE_CACHE_SIZE - 1);

1147

int check_cap = 0;

1147

int check_cap = 0;

1148

1149

dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,

1149

dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,

1150

inode, page, (int)pos, (int)copied, (int)len);

1150

inode, page, (int)pos, (int)copied, (int)len);

1151

1152

/* zero the stale part of the page if we did a short copy */

1152

/* zero the stale part of the page if we did a short copy */

1153

if (copied < len)

1153

if (copied < len)

1154

zero_user_segment(page, from+copied, len);

1154

zero_user_segment(page, from+copied, len);

1155

1156

/* did file size increase? */

1156

/* did file size increase? */

1157

/* (no need for i_size_read(); we caller holds i_mutex */

1157

/* (no need for i_size_read(); we caller holds i_mutex */

1158

if (pos+copied > inode->i_size)

1158

if (pos+copied > inode->i_size)

1159

check_cap = ceph_inode_set_size(inode, pos+copied);

1159

check_cap = ceph_inode_set_size(inode, pos+copied);

1160

1161

if (!PageUptodate(page))

1161

if (!PageUptodate(page))

1162

SetPageUptodate(page);

1162

SetPageUptodate(page);

1163

1164

set_page_dirty(page);

1164

set_page_dirty(page);

1165

1166

unlock_page(page);

1166

unlock_page(page);

1167

up_read(&mdsc->snap_rwsem);

1167

up_read(&mdsc->snap_rwsem);

1168

page_cache_release(page);

1168

page_cache_release(page);

1169

1170

if (check_cap)

1170

if (check_cap)

1171

ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);

1171

ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);

1172

1173

return copied;

1173

return copied;

1174

}

1174

}

1175

1176

/*

1176

/*

1177

* we set .direct_IO to indicate direct io is supported, but since we

1177

* we set .direct_IO to indicate direct io is supported, but since we

1178

* intercept O_DIRECT reads and writes early, this function should

1178

* intercept O_DIRECT reads and writes early, this function should

1179

* never get called.

1179

* never get called.

1180

*/

1180

*/

1181

static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,

1181

static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,

1182

const struct iovec *iov,

1182

const struct iovec *iov,

1183

loff_t pos, unsigned long nr_segs)

1183

loff_t pos, unsigned long nr_segs)

1184

{

1184

{

1185

WARN_ON(1);

1185

WARN_ON(1);

1186

return -EINVAL;

1186

return -EINVAL;

1187

}

1187

}

1188

1189

const struct address_space_operations ceph_aops = {

1189

const struct address_space_operations ceph_aops = {

1190

.readpage = ceph_readpage,

1190

.readpage = ceph_readpage,

1191

.readpages = ceph_readpages,

1191

.readpages = ceph_readpages,

1192

.writepage = ceph_writepage,

1192

.writepage = ceph_writepage,

1193

.writepages = ceph_writepages_start,

1193

.writepages = ceph_writepages_start,

1194

.write_begin = ceph_write_begin,

1194

.write_begin = ceph_write_begin,

1195

.write_end = ceph_write_end,

1195

.write_end = ceph_write_end,

1196

.set_page_dirty = ceph_set_page_dirty,

1196

.set_page_dirty = ceph_set_page_dirty,

1197

.invalidatepage = ceph_invalidatepage,

1197

.invalidatepage = ceph_invalidatepage,

1198

.releasepage = ceph_releasepage,

1198

.releasepage = ceph_releasepage,

1199

.direct_IO = ceph_direct_io,

1199

.direct_IO = ceph_direct_io,

1200

};

1200

};

1201

1202

1203

/*

1203

/*

1204

* vm ops

1204

* vm ops

1205

*/

1205

*/

1206

1207

/*

1207

/*

1208

* Reuse write_begin here for simplicity.

1208

* Reuse write_begin here for simplicity.

1209

*/

1209

*/

1210

static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)

1210

static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)

1211

{

1211

{

1212

struct inode *inode = file_inode(vma->vm_file);

1212

struct inode *inode = file_inode(vma->vm_file);

1213

struct page *page = vmf->page;

1213

struct page *page = vmf->page;

1214

struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;

1214

struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;

1215

loff_t off = page_offset(page);

1215

loff_t off = page_offset(page);

1216

loff_t size, len;

1216

loff_t size, len;

1217

int ret;

1217

int ret;

1218

1219

/* Update time before taking page lock */

1219

/* Update time before taking page lock */

1220

file_update_time(vma->vm_file);

1220

file_update_time(vma->vm_file);

1221

1222

size = i_size_read(inode);

1222

size = i_size_read(inode);

1223

if (off + PAGE_CACHE_SIZE <= size)

1223

if (off + PAGE_CACHE_SIZE <= size)

1224

len = PAGE_CACHE_SIZE;

1224

len = PAGE_CACHE_SIZE;

1225

else

1225

else

1226

len = size & ~PAGE_CACHE_MASK;

1226

len = size & ~PAGE_CACHE_MASK;

1227

1228

dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,

1228

dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,

1229

off, len, page, page->index);

1229

off, len, page, page->index);

1230

1231

lock_page(page);

1231

lock_page(page);

1232

1233

ret = VM_FAULT_NOPAGE;

1233

ret = VM_FAULT_NOPAGE;

1234

if ((off > size) ||

1234

if ((off > size) ||

1235

(page->mapping != inode->i_mapping))

1235

(page->mapping != inode->i_mapping))

1236

goto out;

1236

goto out;

1237

1238

ret = ceph_update_writeable_page(vma->vm_file, off, len, page);

1238

ret = ceph_update_writeable_page(vma->vm_file, off, len, page);

1239

if (ret == 0) {

1239

if (ret == 0) {

1240

/* success. we'll keep the page locked. */

1240

/* success. we'll keep the page locked. */

1241

set_page_dirty(page);

1241

set_page_dirty(page);

1242

up_read(&mdsc->snap_rwsem);

1242

up_read(&mdsc->snap_rwsem);

1243

ret = VM_FAULT_LOCKED;

1243

ret = VM_FAULT_LOCKED;

1244

} else {

1244

} else {

1245

if (ret == -ENOMEM)

1245

if (ret == -ENOMEM)

1246

ret = VM_FAULT_OOM;

1246

ret = VM_FAULT_OOM;

1247

else

1247

else

1248

ret = VM_FAULT_SIGBUS;

1248

ret = VM_FAULT_SIGBUS;

1249

}

1249

}

1250

out:

1250

out:

1251

dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);

1251

dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);

1252

if (ret != VM_FAULT_LOCKED)

1252

if (ret != VM_FAULT_LOCKED)

1253

unlock_page(page);

1253

unlock_page(page);

1254

return ret;

1254

return ret;

1255

}

1255

}

1256

1257

static struct vm_operations_struct ceph_vmops = {

1257

static struct vm_operations_struct ceph_vmops = {

1258

.fault = filemap_fault,

1258

.fault = filemap_fault,

1259

.page_mkwrite = ceph_page_mkwrite,

1259

.page_mkwrite = ceph_page_mkwrite,

1260

.remap_pages = generic_file_remap_pages,

1260

.remap_pages = generic_file_remap_pages,

1261

};

1261

};

1262

1263

int ceph_mmap(struct file *file, struct vm_area_struct *vma)

1263

int ceph_mmap(struct file *file, struct vm_area_struct *vma)

1264

{

1264

{

1265

struct address_space *mapping = file->f_mapping;

1265

struct address_space *mapping = file->f_mapping;

1266

1267

if (!mapping->a_ops->readpage)

1267

if (!mapping->a_ops->readpage)

1268

return -ENOEXEC;

1268

return -ENOEXEC;

1269

file_accessed(file);

1269

file_accessed(file);

1270

vma->vm_ops = &ceph_vmops;

1270

vma->vm_ops = &ceph_vmops;

1271

return 0;

1271

return 0;

1272

}

1272

}

1273

GITLAB

ceph: allocate non-zero page to fscache in readpage()