Eric Lee / smarc-ti-linux-kernel

1

/*
 *	An async IO implementation for Linux
 *	Written by Benjamin LaHaise <bcrl@kvack.org>
 *
 *	Implements an efficient asynchronous io interface.
 *
 *
 *	See ../COPYING for licensing terms.
 */

11

#define pr_fmt(fmt) "%s: " fmt, __func__

11

#define pr_fmt(fmt) "%s: " fmt, __func__

12

13

#include <linux/kernel.h>

13

#include <linux/kernel.h>

14

#include <linux/init.h>

14

#include <linux/init.h>

15

#include <linux/errno.h>

15

#include <linux/errno.h>

16

#include <linux/time.h>

16

#include <linux/time.h>

17

#include <linux/aio_abi.h>

17

#include <linux/aio_abi.h>

18

#include <linux/export.h>

18

#include <linux/export.h>

19

#include <linux/syscalls.h>

19

#include <linux/syscalls.h>

20

#include <linux/backing-dev.h>

20

#include <linux/backing-dev.h>

21

#include <linux/uio.h>

21

#include <linux/uio.h>

22

23

#include <linux/sched.h>

23

#include <linux/sched.h>

24

#include <linux/fs.h>

24

#include <linux/fs.h>

25

#include <linux/file.h>

25

#include <linux/file.h>

26

#include <linux/mm.h>

26

#include <linux/mm.h>

27

#include <linux/mman.h>

27

#include <linux/mman.h>

28

#include <linux/mmu_context.h>

28

#include <linux/mmu_context.h>

29

#include <linux/percpu.h>

29

#include <linux/percpu.h>

30

#include <linux/slab.h>

30

#include <linux/slab.h>

31

#include <linux/timer.h>

31

#include <linux/timer.h>

32

#include <linux/aio.h>

32

#include <linux/aio.h>

33

#include <linux/highmem.h>

33

#include <linux/highmem.h>

34

#include <linux/workqueue.h>

34

#include <linux/workqueue.h>

35

#include <linux/security.h>

35

#include <linux/security.h>

36

#include <linux/eventfd.h>

36

#include <linux/eventfd.h>

37

#include <linux/blkdev.h>

37

#include <linux/blkdev.h>

38

#include <linux/compat.h>

38

#include <linux/compat.h>

39

#include <linux/migrate.h>

39

#include <linux/migrate.h>

40

#include <linux/ramfs.h>

40

#include <linux/ramfs.h>

41

#include <linux/percpu-refcount.h>

41

#include <linux/percpu-refcount.h>

42

#include <linux/mount.h>

42

#include <linux/mount.h>

43

44

#include <asm/kmap_types.h>

44

#include <asm/kmap_types.h>

45

#include <asm/uaccess.h>

45

#include <asm/uaccess.h>

46

47

#include "internal.h"

47

#include "internal.h"

48

49

/*
 * Values written into the ring header by aio_setup_ring():
 * ring->magic, ring->compat_features and ring->incompat_features.
 * AIO_RING_MAGIC is also handed to mount_pseudo() as the magic of the
 * internal "aio" pseudo filesystem.
 */
#define AIO_RING_MAGIC			0xa10a10a1
#define AIO_RING_COMPAT_FEATURES	1
#define AIO_RING_INCOMPAT_FEATURES	0

52

struct aio_ring {

52

struct aio_ring {

53

unsigned id; /* kernel internal index number */

53

unsigned id; /* kernel internal index number */

54

unsigned nr; /* number of io_events */

54

unsigned nr; /* number of io_events */

55

unsigned head; /* Written to by userland or under ring_lock

55

unsigned head; /* Written to by userland or under ring_lock

56

* mutex by aio_read_events_ring(). */

56

* mutex by aio_read_events_ring(). */

57

unsigned tail;

57

unsigned tail;

58

59

unsigned magic;

59

unsigned magic;

60

unsigned compat_features;

60

unsigned compat_features;

61

unsigned incompat_features;

61

unsigned incompat_features;

62

unsigned header_length; /* size of aio_ring */

62

unsigned header_length; /* size of aio_ring */

63

64

65

struct io_event io_events[0];

65

struct io_event io_events[0];

66

}; /* 128 bytes + ring size */

66

}; /* 128 bytes + ring size */

67

68

/*
 * Number of page pointers embedded in struct kioctx (internal_pages[]).
 * aio_setup_ring() only allocates a separate ring_pages array when the
 * ring needs more pages than this.
 */
#define AIO_RING_PAGES	8

69

70

struct kioctx_table {

70

struct kioctx_table {

71

struct rcu_head rcu;

71

struct rcu_head rcu;

72

unsigned nr;

72

unsigned nr;

73

struct kioctx *table[];

73

struct kioctx *table[];

74

};

74

};

75

76

/* Per-cpu part of a kioctx, reached through kioctx->cpu. */
struct kioctx_cpu {
	/* per-cpu counterpart of the global kioctx reqs_available counter */
	unsigned	reqs_available;
};

79

80

struct kioctx {

80

struct kioctx {

81

struct percpu_ref users;

81

struct percpu_ref users;

82

atomic_t dead;

82

atomic_t dead;

83

84

struct percpu_ref reqs;

84

struct percpu_ref reqs;

85

86

unsigned long user_id;

86

unsigned long user_id;

87

88

struct __percpu kioctx_cpu *cpu;

88

struct __percpu kioctx_cpu *cpu;

89

90

/*

90

/*

91

* For percpu reqs_available, number of slots we move to/from global

91

* For percpu reqs_available, number of slots we move to/from global

92

* counter at a time:

92

* counter at a time:

93

*/

93

*/

94

unsigned req_batch;

94

unsigned req_batch;

95

/*

95

/*

96

* This is what userspace passed to io_setup(), it's not used for

96

* This is what userspace passed to io_setup(), it's not used for

97

* anything but counting against the global max_reqs quota.

97

* anything but counting against the global max_reqs quota.

98

*

98

*

99

* The real limit is nr_events - 1, which will be larger (see

99

* The real limit is nr_events - 1, which will be larger (see

100

* aio_setup_ring())

100

* aio_setup_ring())

101

*/

101

*/

102

unsigned max_reqs;

102

unsigned max_reqs;

103

104

/* Size of ringbuffer, in units of struct io_event */

104

/* Size of ringbuffer, in units of struct io_event */

105

unsigned nr_events;

105

unsigned nr_events;

106

107

unsigned long mmap_base;

107

unsigned long mmap_base;

108

unsigned long mmap_size;

108

unsigned long mmap_size;

109

110

struct page **ring_pages;

110

struct page **ring_pages;

111

long nr_pages;

111

long nr_pages;

112

113

struct work_struct free_work;

113

struct work_struct free_work;

114

115

/*

115

/*

116

* signals when all in-flight requests are done

116

* signals when all in-flight requests are done

117

*/

117

*/

118

struct completion *requests_done;

118

struct completion *requests_done;

119

120

struct {

120

struct {

121

/*

121

/*

122

* This counts the number of available slots in the ringbuffer,

122

* This counts the number of available slots in the ringbuffer,

123

* so we avoid overflowing it: it's decremented (if positive)

123

* so we avoid overflowing it: it's decremented (if positive)

124

* when allocating a kiocb and incremented when the resulting

124

* when allocating a kiocb and incremented when the resulting

125

* io_event is pulled off the ringbuffer.

125

* io_event is pulled off the ringbuffer.

126

*

126

*

127

* We batch accesses to it with a percpu version.

127

* We batch accesses to it with a percpu version.

128

*/

128

*/

129

atomic_t reqs_available;

129

atomic_t reqs_available;

130

} ____cacheline_aligned_in_smp;

130

} ____cacheline_aligned_in_smp;

131

132

struct {

132

struct {

133

spinlock_t ctx_lock;

133

spinlock_t ctx_lock;

134

struct list_head active_reqs; /* used for cancellation */

134

struct list_head active_reqs; /* used for cancellation */

135

} ____cacheline_aligned_in_smp;

135

} ____cacheline_aligned_in_smp;

136

137

struct {

137

struct {

138

struct mutex ring_lock;

138

struct mutex ring_lock;

139

wait_queue_head_t wait;

139

wait_queue_head_t wait;

140

} ____cacheline_aligned_in_smp;

140

} ____cacheline_aligned_in_smp;

141

142

struct {

142

struct {

143

unsigned tail;

143

unsigned tail;

144

unsigned completed_events;

144

unsigned completed_events;

145

spinlock_t completion_lock;

145

spinlock_t completion_lock;

146

} ____cacheline_aligned_in_smp;

146

} ____cacheline_aligned_in_smp;

147

148

struct page *internal_pages[AIO_RING_PAGES];

148

struct page *internal_pages[AIO_RING_PAGES];

149

struct file *aio_ring_file;

149

struct file *aio_ring_file;

150

151

unsigned id;

151

unsigned id;

152

};

152

};

153

154

/*------ sysctl variables----*/

154

/*------ sysctl variables----*/

155

static DEFINE_SPINLOCK(aio_nr_lock);

155

static DEFINE_SPINLOCK(aio_nr_lock);

156

unsigned long aio_nr; /* current system wide number of aio requests */

156

unsigned long aio_nr; /* current system wide number of aio requests */

157

unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */

157

unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */

158

/*----end sysctl variables---*/

158

/*----end sysctl variables---*/

159

160

static struct kmem_cache *kiocb_cachep;

160

static struct kmem_cache *kiocb_cachep;

161

static struct kmem_cache *kioctx_cachep;

161

static struct kmem_cache *kioctx_cachep;

162

163

static struct vfsmount *aio_mnt;

163

static struct vfsmount *aio_mnt;

164

165

static const struct file_operations aio_ring_fops;

165

static const struct file_operations aio_ring_fops;

166

static const struct address_space_operations aio_ctx_aops;

166

static const struct address_space_operations aio_ctx_aops;

167

168

/* Backing dev info for aio fs.

168

/* Backing dev info for aio fs.

169

* -no dirty page accounting or writeback happens

169

* -no dirty page accounting or writeback happens

170

*/

170

*/

171

static struct backing_dev_info aio_fs_backing_dev_info = {

171

static struct backing_dev_info aio_fs_backing_dev_info = {

172

.name = "aiofs",

172

.name = "aiofs",

173

.state = 0,

173

.state = 0,

174

.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_COPY,

174

.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_COPY,

175

};

175

};

176

177

/*
 * aio_private_file
 *	Create the anonymous "[aio]" file that backs a context's ring buffer.
 *
 *	@ctx:      context stored as the mapping's private_data
 *	@nr_pages: ring size in pages; the inode size is PAGE_SIZE * nr_pages
 *
 *	Returns the new file, or an ERR_PTR: the error from
 *	alloc_anon_inode() or alloc_file(), or -ENOMEM if no dentry could be
 *	allocated.
 */
static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
{
	struct qstr name = QSTR_INIT("[aio]", 5);
	struct address_space *mapping;
	struct file *ring_file;
	struct inode *inode;
	struct path path;

	inode = alloc_anon_inode(aio_mnt->mnt_sb);
	if (IS_ERR(inode))
		return ERR_CAST(inode);

	/* Hook the ring's address_space up to the aio ops and to @ctx. */
	mapping = inode->i_mapping;
	mapping->a_ops = &aio_ctx_aops;
	mapping->private_data = ctx;
	mapping->backing_dev_info = &aio_fs_backing_dev_info;
	inode->i_size = PAGE_SIZE * nr_pages;

	path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &name);
	if (!path.dentry) {
		/* The inode is not attached to a dentry yet: drop it here. */
		iput(inode);
		return ERR_PTR(-ENOMEM);
	}
	path.mnt = mntget(aio_mnt);

	d_instantiate(path.dentry, inode);
	ring_file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &aio_ring_fops);
	if (IS_ERR(ring_file)) {
		path_put(&path);
		return ring_file;
	}

	ring_file->f_flags = O_RDWR;
	return ring_file;
}

208

209

static struct dentry *aio_mount(struct file_system_type *fs_type,

209

static struct dentry *aio_mount(struct file_system_type *fs_type,

210

int flags, const char *dev_name, void *data)

210

int flags, const char *dev_name, void *data)

211

{

211

{

212

static const struct dentry_operations ops = {

212

static const struct dentry_operations ops = {

213

.d_dname = simple_dname,

213

.d_dname = simple_dname,

214

};

214

};

215

return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC);

215

return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC);

216

}

216

}

217

218

/* aio_setup

218

/* aio_setup

219

* Creates the slab caches used by the aio routines, panic on

219

* Creates the slab caches used by the aio routines, panic on

220

* failure as this is done early during the boot sequence.

220

* failure as this is done early during the boot sequence.

221

*/

221

*/

222

static int __init aio_setup(void)

222

static int __init aio_setup(void)

223

{

223

{

224

static struct file_system_type aio_fs = {

224

static struct file_system_type aio_fs = {

225

.name = "aio",

225

.name = "aio",

226

.mount = aio_mount,

226

.mount = aio_mount,

227

.kill_sb = kill_anon_super,

227

.kill_sb = kill_anon_super,

228

};

228

};

229

aio_mnt = kern_mount(&aio_fs);

229

aio_mnt = kern_mount(&aio_fs);

230

if (IS_ERR(aio_mnt))

230

if (IS_ERR(aio_mnt))

231

panic("Failed to create aio fs mount.");

231

panic("Failed to create aio fs mount.");

232

233

if (bdi_init(&aio_fs_backing_dev_info))

233

if (bdi_init(&aio_fs_backing_dev_info))

234

panic("Failed to init aio fs backing dev info.");

234

panic("Failed to init aio fs backing dev info.");

235

236

kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);

236

kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);

237

kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);

237

kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);

238

239

pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));

239

pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));

240

241

return 0;

241

return 0;

242

}

242

}

243

__initcall(aio_setup);

243

__initcall(aio_setup);

244

245

static void put_aio_ring_file(struct kioctx *ctx)

245

static void put_aio_ring_file(struct kioctx *ctx)

246

{

246

{

247

struct file *aio_ring_file = ctx->aio_ring_file;

247

struct file *aio_ring_file = ctx->aio_ring_file;

248

if (aio_ring_file) {

248

if (aio_ring_file) {

249

truncate_setsize(aio_ring_file->f_inode, 0);

249

truncate_setsize(aio_ring_file->f_inode, 0);

250

251

/* Prevent further access to the kioctx from migratepages */

251

/* Prevent further access to the kioctx from migratepages */

252

spin_lock(&aio_ring_file->f_inode->i_mapping->private_lock);

252

spin_lock(&aio_ring_file->f_inode->i_mapping->private_lock);

253

aio_ring_file->f_inode->i_mapping->private_data = NULL;

253

aio_ring_file->f_inode->i_mapping->private_data = NULL;

254

ctx->aio_ring_file = NULL;

254

ctx->aio_ring_file = NULL;

255

spin_unlock(&aio_ring_file->f_inode->i_mapping->private_lock);

255

spin_unlock(&aio_ring_file->f_inode->i_mapping->private_lock);

256

257

fput(aio_ring_file);

257

fput(aio_ring_file);

258

}

258

}

259

}

259

}

260

261

static void aio_free_ring(struct kioctx *ctx)

261

static void aio_free_ring(struct kioctx *ctx)

262

{

262

{

263

int i;

263

int i;

264

265

/* Disconnect the kiotx from the ring file. This prevents future

265

/* Disconnect the kiotx from the ring file. This prevents future

266

* accesses to the kioctx from page migration.

266

* accesses to the kioctx from page migration.

267

*/

267

*/

268

put_aio_ring_file(ctx);

268

put_aio_ring_file(ctx);

269

270

for (i = 0; i < ctx->nr_pages; i++) {

270

for (i = 0; i < ctx->nr_pages; i++) {

271

struct page *page;

271

struct page *page;

272

pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,

272

pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,

273

page_count(ctx->ring_pages[i]));

273

page_count(ctx->ring_pages[i]));

274

page = ctx->ring_pages[i];

274

page = ctx->ring_pages[i];

275

if (!page)

275

if (!page)

276

continue;

276

continue;

277

ctx->ring_pages[i] = NULL;

277

ctx->ring_pages[i] = NULL;

278

put_page(page);

278

put_page(page);

279

}

279

}

280

281

if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {

281

if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {

282

kfree(ctx->ring_pages);

282

kfree(ctx->ring_pages);

283

ctx->ring_pages = NULL;

283

ctx->ring_pages = NULL;

284

}

284

}

285

}

285

}

286

287

static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)

287

static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)

288

{

288

{

289

vma->vm_flags |= VM_DONTEXPAND;

289

vma->vm_flags |= VM_DONTEXPAND;

290

vma->vm_ops = &generic_file_vm_ops;

290

vma->vm_ops = &generic_file_vm_ops;

291

return 0;

291

return 0;

292

}

292

}

293

294

static void aio_ring_remap(struct file *file, struct vm_area_struct *vma)

294

static void aio_ring_remap(struct file *file, struct vm_area_struct *vma)

295

{

295

{

296

struct mm_struct *mm = vma->vm_mm;

296

struct mm_struct *mm = vma->vm_mm;

297

struct kioctx_table *table;

297

struct kioctx_table *table;

298

int i;

298

int i;

299

300

spin_lock(&mm->ioctx_lock);

300

spin_lock(&mm->ioctx_lock);

301

rcu_read_lock();

301

rcu_read_lock();

302

table = rcu_dereference(mm->ioctx_table);

302

table = rcu_dereference(mm->ioctx_table);

303

for (i = 0; i < table->nr; i++) {

303

for (i = 0; i < table->nr; i++) {

304

struct kioctx *ctx;

304

struct kioctx *ctx;

305

306

ctx = table->table[i];

306

ctx = table->table[i];

307

if (ctx && ctx->aio_ring_file == file) {

307

if (ctx && ctx->aio_ring_file == file) {

308

ctx->user_id = ctx->mmap_base = vma->vm_start;

308

ctx->user_id = ctx->mmap_base = vma->vm_start;

309

break;

309

break;

310

}

310

}

311

}

311

}

312

313

rcu_read_unlock();

313

rcu_read_unlock();

314

spin_unlock(&mm->ioctx_lock);

314

spin_unlock(&mm->ioctx_lock);

315

}

315

}

316

317

static const struct file_operations aio_ring_fops = {

317

static const struct file_operations aio_ring_fops = {

318

.mmap = aio_ring_mmap,

318

.mmap = aio_ring_mmap,

319

.mremap = aio_ring_remap,

319

.mremap = aio_ring_remap,

320

};

320

};

321

322

#if IS_ENABLED(CONFIG_MIGRATION)
/*
 * aio_migratepage
 *	->migratepage handler for ring pages: replaces @old with @new in
 *	the mapping and in ctx->ring_pages[].
 *
 *	Returns -EINVAL when the mapping has no context any more or @old's
 *	index lies outside the ring, -EAGAIN when ring_lock is contended
 *	or the slot no longer holds @old, and otherwise whatever
 *	migrate_page_move_mapping() returned.
 */
static int aio_migratepage(struct address_space *mapping, struct page *new,
			struct page *old, enum migrate_mode mode)
{
	struct kioctx *ctx;
	unsigned long flags;
	pgoff_t idx;
	int rc = 0;

	/* mapping->private_lock here protects against the kioctx teardown. */
	spin_lock(&mapping->private_lock);
	ctx = mapping->private_data;
	if (!ctx) {
		rc = -EINVAL;
		goto out;
	}

	/* The ring_lock mutex.  This prevents aio_read_events() from writing
	 * to the ring's head, and prevents page migration from mucking in
	 * a partially initialized kioctx.  Only a trylock is possible while
	 * the private_lock spinlock is held.
	 */
	if (!mutex_trylock(&ctx->ring_lock)) {
		rc = -EAGAIN;
		goto out;
	}

	idx = old->index;
	if (idx >= (pgoff_t)ctx->nr_pages) {
		rc = -EINVAL;
		goto out_unlock;
	}

	/* Make sure the old page hasn't already been changed */
	if (ctx->ring_pages[idx] != old) {
		rc = -EAGAIN;
		goto out_unlock;
	}

	/* Writeback must be complete */
	BUG_ON(PageWriteback(old));
	get_page(new);

	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
	if (rc != MIGRATEPAGE_SUCCESS) {
		/* undo the reference taken on @new above */
		put_page(new);
		goto out_unlock;
	}

	/* Take completion_lock to prevent other writes to the ring buffer
	 * while the old page is copied to the new.  This prevents new
	 * events from being lost.
	 */
	spin_lock_irqsave(&ctx->completion_lock, flags);
	migrate_page_copy(new, old);
	BUG_ON(ctx->ring_pages[idx] != old);
	ctx->ring_pages[idx] = new;
	spin_unlock_irqrestore(&ctx->completion_lock, flags);

	/* The old page is no longer accessible. */
	put_page(old);

out_unlock:
	mutex_unlock(&ctx->ring_lock);
out:
	spin_unlock(&mapping->private_lock);
	return rc;
}
#endif

391

392

static const struct address_space_operations aio_ctx_aops = {

392

static const struct address_space_operations aio_ctx_aops = {

393

.set_page_dirty = __set_page_dirty_no_writeback,

393

.set_page_dirty = __set_page_dirty_no_writeback,

394

#if IS_ENABLED(CONFIG_MIGRATION)

394

#if IS_ENABLED(CONFIG_MIGRATION)

395

.migratepage = aio_migratepage,

395

.migratepage = aio_migratepage,

396

#endif

396

#endif

397

};

397

};

398

399

static int aio_setup_ring(struct kioctx *ctx)

399

static int aio_setup_ring(struct kioctx *ctx)

400

{

400

{

401

struct aio_ring *ring;

401

struct aio_ring *ring;

402

unsigned nr_events = ctx->max_reqs;

402

unsigned nr_events = ctx->max_reqs;

403

struct mm_struct *mm = current->mm;

403

struct mm_struct *mm = current->mm;

404

unsigned long size, unused;

404

unsigned long size, unused;

405

int nr_pages;

405

int nr_pages;

406

int i;

406

int i;

407

struct file *file;

407

struct file *file;

408

409

/* Compensate for the ring buffer's head/tail overlap entry */

409

/* Compensate for the ring buffer's head/tail overlap entry */

410

nr_events += 2; /* 1 is required, 2 for good luck */

410

nr_events += 2; /* 1 is required, 2 for good luck */

411

412

size = sizeof(struct aio_ring);

412

size = sizeof(struct aio_ring);

413

size += sizeof(struct io_event) * nr_events;

413

size += sizeof(struct io_event) * nr_events;

414

415

nr_pages = PFN_UP(size);

415

nr_pages = PFN_UP(size);

416

if (nr_pages < 0)

416

if (nr_pages < 0)

417

return -EINVAL;

417

return -EINVAL;

418

419

file = aio_private_file(ctx, nr_pages);

419

file = aio_private_file(ctx, nr_pages);

420

if (IS_ERR(file)) {

420

if (IS_ERR(file)) {

421

ctx->aio_ring_file = NULL;

421

ctx->aio_ring_file = NULL;

422

return -ENOMEM;

422

return -ENOMEM;

423

}

423

}

424

425

ctx->aio_ring_file = file;

425

ctx->aio_ring_file = file;

426

nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))

426

nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))

427

/ sizeof(struct io_event);

427

/ sizeof(struct io_event);

428

429

ctx->ring_pages = ctx->internal_pages;

429

ctx->ring_pages = ctx->internal_pages;

430

if (nr_pages > AIO_RING_PAGES) {

430

if (nr_pages > AIO_RING_PAGES) {

431

ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),

431

ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),

432

GFP_KERNEL);

432

GFP_KERNEL);

433

if (!ctx->ring_pages) {

433

if (!ctx->ring_pages) {

434

put_aio_ring_file(ctx);

434

put_aio_ring_file(ctx);

435

return -ENOMEM;

435

return -ENOMEM;

436

}

436

}

437

}

437

}

438

439

for (i = 0; i < nr_pages; i++) {

439

for (i = 0; i < nr_pages; i++) {

440

struct page *page;

440

struct page *page;

441

page = find_or_create_page(file->f_inode->i_mapping,

441

page = find_or_create_page(file->f_inode->i_mapping,

442

i, GFP_HIGHUSER | __GFP_ZERO);

442

i, GFP_HIGHUSER | __GFP_ZERO);

443

if (!page)

443

if (!page)

444

break;

444

break;

445

pr_debug("pid(%d) page[%d]->count=%d\n",

445

pr_debug("pid(%d) page[%d]->count=%d\n",

446

current->pid, i, page_count(page));

446

current->pid, i, page_count(page));

447

SetPageUptodate(page);

447

SetPageUptodate(page);

448

unlock_page(page);

448

unlock_page(page);

449

450

ctx->ring_pages[i] = page;

450

ctx->ring_pages[i] = page;

451

}

451

}

452

ctx->nr_pages = i;

452

ctx->nr_pages = i;

453

454

if (unlikely(i != nr_pages)) {

454

if (unlikely(i != nr_pages)) {

455

aio_free_ring(ctx);

455

aio_free_ring(ctx);

456

return -ENOMEM;

456

return -ENOMEM;

457

}

457

}

458

459

ctx->mmap_size = nr_pages * PAGE_SIZE;

459

ctx->mmap_size = nr_pages * PAGE_SIZE;

460

pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);

460

pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);

461

462

down_write(&mm->mmap_sem);

462

down_write(&mm->mmap_sem);

463

ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,

463

ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,

464

PROT_READ | PROT_WRITE,

464

PROT_READ | PROT_WRITE,

465

MAP_SHARED, 0, &unused);

465

MAP_SHARED, 0, &unused);

466

up_write(&mm->mmap_sem);

466

up_write(&mm->mmap_sem);

467

if (IS_ERR((void *)ctx->mmap_base)) {

467

if (IS_ERR((void *)ctx->mmap_base)) {

468

ctx->mmap_size = 0;

468

ctx->mmap_size = 0;

469

aio_free_ring(ctx);

469

aio_free_ring(ctx);

470

return -ENOMEM;

470

return -ENOMEM;

471

}

471

}

472

473

pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);

473

pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);

474

475

ctx->user_id = ctx->mmap_base;

475

ctx->user_id = ctx->mmap_base;

476

ctx->nr_events = nr_events; /* trusted copy */

476

ctx->nr_events = nr_events; /* trusted copy */

477

478

ring = kmap_atomic(ctx->ring_pages[0]);

478

ring = kmap_atomic(ctx->ring_pages[0]);

479

ring->nr = nr_events; /* user copy */

479

ring->nr = nr_events; /* user copy */

480

ring->id = ~0U;

480

ring->id = ~0U;

481

ring->head = ring->tail = 0;

481

ring->head = ring->tail = 0;

482

ring->magic = AIO_RING_MAGIC;

482

ring->magic = AIO_RING_MAGIC;

483

ring->compat_features = AIO_RING_COMPAT_FEATURES;

483

ring->compat_features = AIO_RING_COMPAT_FEATURES;

484

ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;

484

ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;

485

ring->header_length = sizeof(struct aio_ring);

485

ring->header_length = sizeof(struct aio_ring);

486

kunmap_atomic(ring);

486

kunmap_atomic(ring);

487

flush_dcache_page(ctx->ring_pages[0]);

487

flush_dcache_page(ctx->ring_pages[0]);

488

489

return 0;

489

return 0;

490

}

490

}

491

492

/*
 * Ring geometry helpers:
 *  AIO_EVENTS_PER_PAGE   - io_events that fit in one full page
 *  AIO_EVENTS_FIRST_PAGE - io_events that fit in the first page, after
 *                          the struct aio_ring header
 *  AIO_EVENTS_OFFSET     - difference between the two
 */
#define AIO_EVENTS_PER_PAGE	(PAGE_SIZE / sizeof(struct io_event))
#define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
#define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)

495

496

/*
 * kiocb_set_cancel_fn
 *	Register @cancel as the cancellation callback of @req and, if the
 *	request is not on a list yet (ki_list.next is NULL), add it to its
 *	context's active_reqs list.  Both steps run under ctx_lock with
 *	interrupts disabled.
 */
void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
{
	struct kioctx *ctx = req->ki_ctx;
	unsigned long irq_flags;

	spin_lock_irqsave(&ctx->ctx_lock, irq_flags);

	if (req->ki_list.next == NULL)
		list_add(&req->ki_list, &ctx->active_reqs);

	req->ki_cancel = cancel;

	spin_unlock_irqrestore(&ctx->ctx_lock, irq_flags);
}
EXPORT_SYMBOL(kiocb_set_cancel_fn);

511

512

static int kiocb_cancel(struct kiocb *kiocb)

512

static int kiocb_cancel(struct kiocb *kiocb)

513

{

513

{

514

kiocb_cancel_fn *old, *cancel;

514

kiocb_cancel_fn *old, *cancel;

515

516

/*

516

/*

517

* Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it

517

* Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it

518

* actually has a cancel function, hence the cmpxchg()

518

* actually has a cancel function, hence the cmpxchg()

519

*/

519

*/

520

521

cancel = ACCESS_ONCE(kiocb->ki_cancel);

521

cancel = ACCESS_ONCE(kiocb->ki_cancel);

522

do {

522

do {

523

if (!cancel || cancel == KIOCB_CANCELLED)

523

if (!cancel || cancel == KIOCB_CANCELLED)

524

return -EINVAL;

524

return -EINVAL;

525

526

old = cancel;

526

old = cancel;

527

cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);

527

cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);

528

} while (cancel != old);

528

} while (cancel != old);

529

530

return cancel(kiocb);

530

return cancel(kiocb);

531

}

531

}

532

533

static void free_ioctx(struct work_struct *work)

533

static void free_ioctx(struct work_struct *work)

534

{

534

{

535

struct kioctx *ctx = container_of(work, struct kioctx, free_work);

535

struct kioctx *ctx = container_of(work, struct kioctx, free_work);

536

537

pr_debug("freeing %p\n", ctx);

537

pr_debug("freeing %p\n", ctx);

538

539

aio_free_ring(ctx);

539

aio_free_ring(ctx);

540

free_percpu(ctx->cpu);

540

free_percpu(ctx->cpu);

541

percpu_ref_exit(&ctx->reqs);

541

percpu_ref_exit(&ctx->reqs);

542

percpu_ref_exit(&ctx->users);

542

percpu_ref_exit(&ctx->users);

543

kmem_cache_free(kioctx_cachep, ctx);

543

kmem_cache_free(kioctx_cachep, ctx);

544

}

544

}

545

546

static void free_ioctx_reqs(struct percpu_ref *ref)

546

static void free_ioctx_reqs(struct percpu_ref *ref)

547

{

547

{

548

struct kioctx *ctx = container_of(ref, struct kioctx, reqs);

548

struct kioctx *ctx = container_of(ref, struct kioctx, reqs);

549

550

/* At this point we know that there are no any in-flight requests */

550

/* At this point we know that there are no any in-flight requests */

551

if (ctx->requests_done)

551

if (ctx->requests_done)

552

complete(ctx->requests_done);

552

complete(ctx->requests_done);

553

554

INIT_WORK(&ctx->free_work, free_ioctx);

554

INIT_WORK(&ctx->free_work, free_ioctx);

555

schedule_work(&ctx->free_work);

555

schedule_work(&ctx->free_work);

556

}

556

}

557

558

/*

558

/*

559

* When this function runs, the kioctx has been removed from the "hash table"

559

* When this function runs, the kioctx has been removed from the "hash table"

560

* and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -

560

* and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -

561

* now it's safe to cancel any that need to be.

561

* now it's safe to cancel any that need to be.

562

*/

562

*/

563

static void free_ioctx_users(struct percpu_ref *ref)

563

static void free_ioctx_users(struct percpu_ref *ref)

564

{

564

{

565

struct kioctx *ctx = container_of(ref, struct kioctx, users);

565

struct kioctx *ctx = container_of(ref, struct kioctx, users);

566

struct kiocb *req;

566

struct kiocb *req;

567

568

spin_lock_irq(&ctx->ctx_lock);

568

spin_lock_irq(&ctx->ctx_lock);

569

570

while (!list_empty(&ctx->active_reqs)) {

570

while (!list_empty(&ctx->active_reqs)) {

571

req = list_first_entry(&ctx->active_reqs,

571

req = list_first_entry(&ctx->active_reqs,

572

struct kiocb, ki_list);

572

struct kiocb, ki_list);

573

574

list_del_init(&req->ki_list);

574

list_del_init(&req->ki_list);

575

kiocb_cancel(req);

575

kiocb_cancel(req);

576

}

576

}

577

578

spin_unlock_irq(&ctx->ctx_lock);

578

spin_unlock_irq(&ctx->ctx_lock);

579

580

percpu_ref_kill(&ctx->reqs);

580

percpu_ref_kill(&ctx->reqs);

581

percpu_ref_put(&ctx->reqs);

581

percpu_ref_put(&ctx->reqs);

582

}

582

}

583

584

static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)

584

static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)

585

{

585

{

586

unsigned i, new_nr;

586

unsigned i, new_nr;

587

struct kioctx_table *table, *old;

587

struct kioctx_table *table, *old;

588

struct aio_ring *ring;

588

struct aio_ring *ring;

589

590

spin_lock(&mm->ioctx_lock);

590

spin_lock(&mm->ioctx_lock);

591

table = rcu_dereference_raw(mm->ioctx_table);

591

table = rcu_dereference_raw(mm->ioctx_table);

592

593

while (1) {

593

while (1) {

594

if (table)

594

if (table)

595

for (i = 0; i < table->nr; i++)

595

for (i = 0; i < table->nr; i++)

596

if (!table->table[i]) {

596

if (!table->table[i]) {

597

ctx->id = i;

597

ctx->id = i;

598

table->table[i] = ctx;

598

table->table[i] = ctx;

599

spin_unlock(&mm->ioctx_lock);

599

spin_unlock(&mm->ioctx_lock);

600

601

/* While kioctx setup is in progress,

601

/* While kioctx setup is in progress,

602

* we are protected from page migration

602

* we are protected from page migration

603

* changes ring_pages by ->ring_lock.

603

* changes ring_pages by ->ring_lock.

604

*/

604

*/

605

ring = kmap_atomic(ctx->ring_pages[0]);

605

ring = kmap_atomic(ctx->ring_pages[0]);

606

ring->id = ctx->id;

606

ring->id = ctx->id;

607

kunmap_atomic(ring);

607

kunmap_atomic(ring);

608

return 0;

608

return 0;

609

}

609

}

610

611

new_nr = (table ? table->nr : 1) * 4;

611

new_nr = (table ? table->nr : 1) * 4;

612

spin_unlock(&mm->ioctx_lock);

612

spin_unlock(&mm->ioctx_lock);

613

614

table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *

614

table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *

615

new_nr, GFP_KERNEL);

615

new_nr, GFP_KERNEL);

616

if (!table)

616

if (!table)

617

return -ENOMEM;

617

return -ENOMEM;

618

619

table->nr = new_nr;

619

table->nr = new_nr;

620

621

spin_lock(&mm->ioctx_lock);

621

spin_lock(&mm->ioctx_lock);

622

old = rcu_dereference_raw(mm->ioctx_table);

622

old = rcu_dereference_raw(mm->ioctx_table);

623

624

if (!old) {

624

if (!old) {

625

rcu_assign_pointer(mm->ioctx_table, table);

625

rcu_assign_pointer(mm->ioctx_table, table);

626

} else if (table->nr > old->nr) {

626

} else if (table->nr > old->nr) {

627

memcpy(table->table, old->table,

627

memcpy(table->table, old->table,

628

old->nr * sizeof(struct kioctx *));

628

old->nr * sizeof(struct kioctx *));

629

630

rcu_assign_pointer(mm->ioctx_table, table);

630

rcu_assign_pointer(mm->ioctx_table, table);

631

kfree_rcu(old, rcu);

631

kfree_rcu(old, rcu);

632

} else {

632

} else {

633

kfree(table);

633

kfree(table);

634

table = old;

634

table = old;

635

}

635

}

636

}

636

}

637

}

637

}

638

639

static void aio_nr_sub(unsigned nr)

639

static void aio_nr_sub(unsigned nr)

640

{

640

{

641

spin_lock(&aio_nr_lock);

641

spin_lock(&aio_nr_lock);

642

if (WARN_ON(aio_nr - nr > aio_nr))

642

if (WARN_ON(aio_nr - nr > aio_nr))

643

aio_nr = 0;

643

aio_nr = 0;

644

else

644

else

645

aio_nr -= nr;

645

aio_nr -= nr;

646

spin_unlock(&aio_nr_lock);

646

spin_unlock(&aio_nr_lock);

647

}

647

}

648

649

/* ioctx_alloc

649

/* ioctx_alloc

650

* Allocates and initializes an ioctx. Returns an ERR_PTR if it failed.

650

* Allocates and initializes an ioctx. Returns an ERR_PTR if it failed.

651

*/

651

*/

652

static struct kioctx *ioctx_alloc(unsigned nr_events)

652

static struct kioctx *ioctx_alloc(unsigned nr_events)

653

{

653

{

654

struct mm_struct *mm = current->mm;

654

struct mm_struct *mm = current->mm;

655

struct kioctx *ctx;

655

struct kioctx *ctx;

656

int err = -ENOMEM;

656

int err = -ENOMEM;

657

658

/*

658

/*

659

* We keep track of the number of available ringbuffer slots, to prevent

659

* We keep track of the number of available ringbuffer slots, to prevent

660

* overflow (reqs_available), and we also use percpu counters for this.

660

* overflow (reqs_available), and we also use percpu counters for this.

661

*

661

*

662

* So since up to half the slots might be on other cpu's percpu counters

662

* So since up to half the slots might be on other cpu's percpu counters

663

* and unavailable, double nr_events so userspace sees what they

663

* and unavailable, double nr_events so userspace sees what they

664

* expected: additionally, we move req_batch slots to/from percpu

664

* expected: additionally, we move req_batch slots to/from percpu

665

* counters at a time, so make sure that isn't 0:

665

* counters at a time, so make sure that isn't 0:

666

*/

666

*/

667

nr_events = max(nr_events, num_possible_cpus() * 4);

667

nr_events = max(nr_events, num_possible_cpus() * 4);

668

nr_events *= 2;

668

nr_events *= 2;

669

670

/* Prevent overflows */

670

/* Prevent overflows */

671

if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||

671

if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||

672

(nr_events > (0x10000000U / sizeof(struct kiocb)))) {

672

(nr_events > (0x10000000U / sizeof(struct kiocb)))) {

673

pr_debug("ENOMEM: nr_events too high\n");

673

pr_debug("ENOMEM: nr_events too high\n");

674

return ERR_PTR(-EINVAL);

674

return ERR_PTR(-EINVAL);

675

}

675

}

676

677

if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL))

677

if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL))

678

return ERR_PTR(-EAGAIN);

678

return ERR_PTR(-EAGAIN);

679

680

ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);

680

ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);

681

if (!ctx)

681

if (!ctx)

682

return ERR_PTR(-ENOMEM);

682

return ERR_PTR(-ENOMEM);

683

684

ctx->max_reqs = nr_events;

684

ctx->max_reqs = nr_events;

685

686

spin_lock_init(&ctx->ctx_lock);

686

spin_lock_init(&ctx->ctx_lock);

687

spin_lock_init(&ctx->completion_lock);

687

spin_lock_init(&ctx->completion_lock);

688

mutex_init(&ctx->ring_lock);

688

mutex_init(&ctx->ring_lock);

689

/* Protect against page migration throughout kiotx setup by keeping

689

/* Protect against page migration throughout kiotx setup by keeping

690

* the ring_lock mutex held until setup is complete. */

690

* the ring_lock mutex held until setup is complete. */

691

mutex_lock(&ctx->ring_lock);

691

mutex_lock(&ctx->ring_lock);

692

init_waitqueue_head(&ctx->wait);

692

init_waitqueue_head(&ctx->wait);

693

694

INIT_LIST_HEAD(&ctx->active_reqs);

694

INIT_LIST_HEAD(&ctx->active_reqs);

695

696

if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL))

696

if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL))

697

goto err;

697

goto err;

698

699

if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL))

699

if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL))

700

goto err;

700

goto err;

701

702

ctx->cpu = alloc_percpu(struct kioctx_cpu);

702

ctx->cpu = alloc_percpu(struct kioctx_cpu);

703

if (!ctx->cpu)

703

if (!ctx->cpu)

704

goto err;

704

goto err;

705

706

err = aio_setup_ring(ctx);

706

err = aio_setup_ring(ctx);

707

if (err < 0)

707

if (err < 0)

708

goto err;

708

goto err;

709

710

atomic_set(&ctx->reqs_available, ctx->nr_events - 1);

710

atomic_set(&ctx->reqs_available, ctx->nr_events - 1);

711

ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);

711

ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);

712

if (ctx->req_batch < 1)

712

if (ctx->req_batch < 1)

713

ctx->req_batch = 1;

713

ctx->req_batch = 1;

714

715

/* limit the number of system wide aios */

715

/* limit the number of system wide aios */

716

spin_lock(&aio_nr_lock);

716

spin_lock(&aio_nr_lock);

717

if (aio_nr + nr_events > (aio_max_nr * 2UL) ||

717

if (aio_nr + nr_events > (aio_max_nr * 2UL) ||

718

aio_nr + nr_events < aio_nr) {

718

aio_nr + nr_events < aio_nr) {

719

spin_unlock(&aio_nr_lock);

719

spin_unlock(&aio_nr_lock);

720

err = -EAGAIN;

720

err = -EAGAIN;

721

goto err_ctx;

721

goto err_ctx;

722

}

722

}

723

aio_nr += ctx->max_reqs;

723

aio_nr += ctx->max_reqs;

724

spin_unlock(&aio_nr_lock);

724

spin_unlock(&aio_nr_lock);

725

726

percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */

726

percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */

727

percpu_ref_get(&ctx->reqs); /* free_ioctx_users() will drop this */

727

percpu_ref_get(&ctx->reqs); /* free_ioctx_users() will drop this */

728

729

err = ioctx_add_table(ctx, mm);

729

err = ioctx_add_table(ctx, mm);

730

if (err)

730

if (err)

731

goto err_cleanup;

731

goto err_cleanup;

732

733

/* Release the ring_lock mutex now that all setup is complete. */

733

/* Release the ring_lock mutex now that all setup is complete. */

734

mutex_unlock(&ctx->ring_lock);

734

mutex_unlock(&ctx->ring_lock);

735

736

pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",

736

pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",

737

ctx, ctx->user_id, mm, ctx->nr_events);

737

ctx, ctx->user_id, mm, ctx->nr_events);

738

return ctx;

738

return ctx;

739

740

err_cleanup:

740

err_cleanup:

741

aio_nr_sub(ctx->max_reqs);

741

aio_nr_sub(ctx->max_reqs);

742

err_ctx:

742

err_ctx:

743

aio_free_ring(ctx);

743

aio_free_ring(ctx);

744

err:

744

err:

745

mutex_unlock(&ctx->ring_lock);

745

mutex_unlock(&ctx->ring_lock);

746

free_percpu(ctx->cpu);

746

free_percpu(ctx->cpu);

747

percpu_ref_exit(&ctx->reqs);

747

percpu_ref_exit(&ctx->reqs);

748

percpu_ref_exit(&ctx->users);

748

percpu_ref_exit(&ctx->users);

749

kmem_cache_free(kioctx_cachep, ctx);

749

kmem_cache_free(kioctx_cachep, ctx);

750

pr_debug("error allocating ioctx %d\n", err);

750

pr_debug("error allocating ioctx %d\n", err);

751

return ERR_PTR(err);

751

return ERR_PTR(err);

752

}

752

}

753

754

/* kill_ioctx

754

/* kill_ioctx

755

* Cancels all outstanding aio requests on an aio context. Used

755

* Cancels all outstanding aio requests on an aio context. Used

756

* when the processes owning a context have all exited to encourage

756

* when the processes owning a context have all exited to encourage

757

* the rapid destruction of the kioctx.

757

* the rapid destruction of the kioctx.

758

*/

758

*/

759

static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,

759

static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,

760

struct completion *requests_done)

760

struct completion *requests_done)

761

{

761

{

762

struct kioctx_table *table;

762

struct kioctx_table *table;

763

764

if (atomic_xchg(&ctx->dead, 1))

764

if (atomic_xchg(&ctx->dead, 1))

765

return -EINVAL;

765

return -EINVAL;

766

767

768

spin_lock(&mm->ioctx_lock);

768

spin_lock(&mm->ioctx_lock);

769

table = rcu_dereference_raw(mm->ioctx_table);

769

table = rcu_dereference_raw(mm->ioctx_table);

770

WARN_ON(ctx != table->table[ctx->id]);

770

WARN_ON(ctx != table->table[ctx->id]);

771

table->table[ctx->id] = NULL;

771

table->table[ctx->id] = NULL;

772

spin_unlock(&mm->ioctx_lock);

772

spin_unlock(&mm->ioctx_lock);

773

774

/* percpu_ref_kill() will do the necessary call_rcu() */

774

/* percpu_ref_kill() will do the necessary call_rcu() */

775

wake_up_all(&ctx->wait);

775

wake_up_all(&ctx->wait);

776

777

/*

777

/*

778

* It'd be more correct to do this in free_ioctx(), after all

778

* It'd be more correct to do this in free_ioctx(), after all

779

* the outstanding kiocbs have finished - but by then io_destroy

779

* the outstanding kiocbs have finished - but by then io_destroy

780

* has already returned, so io_setup() could potentially return

780

* has already returned, so io_setup() could potentially return

781

* -EAGAIN with no ioctxs actually in use (as far as userspace

781

* -EAGAIN with no ioctxs actually in use (as far as userspace

782

* could tell).

782

* could tell).

783

*/

783

*/

784

aio_nr_sub(ctx->max_reqs);

784

aio_nr_sub(ctx->max_reqs);

785

786

if (ctx->mmap_size)

786

if (ctx->mmap_size)

787

vm_munmap(ctx->mmap_base, ctx->mmap_size);

787

vm_munmap(ctx->mmap_base, ctx->mmap_size);

788

789

ctx->requests_done = requests_done;

789

ctx->requests_done = requests_done;

790

percpu_ref_kill(&ctx->users);

790

percpu_ref_kill(&ctx->users);

791

return 0;

791

return 0;

792

}

792

}

793

794

/* wait_on_sync_kiocb:

794

/* wait_on_sync_kiocb:

795

* Waits on the given sync kiocb to complete.

795

* Waits on the given sync kiocb to complete.

796

*/

796

*/

797

ssize_t wait_on_sync_kiocb(struct kiocb *req)

797

ssize_t wait_on_sync_kiocb(struct kiocb *req)

798

{

798

{

799

while (!req->ki_ctx) {

799

while (!req->ki_ctx) {

800

set_current_state(TASK_UNINTERRUPTIBLE);

800

set_current_state(TASK_UNINTERRUPTIBLE);

801

if (req->ki_ctx)

801

if (req->ki_ctx)

802

break;

802

break;

803

io_schedule();

803

io_schedule();

804

}

804

}

805

__set_current_state(TASK_RUNNING);

805

__set_current_state(TASK_RUNNING);

806

return req->ki_user_data;

806

return req->ki_user_data;

807

}

807

}

808

EXPORT_SYMBOL(wait_on_sync_kiocb);

808

EXPORT_SYMBOL(wait_on_sync_kiocb);

809

810

/*

810

/*

811

* exit_aio: called when the last user of mm goes away. At this point, there is

811

* exit_aio: called when the last user of mm goes away. At this point, there is

812

* no way for any new requests to be submited or any of the io_* syscalls to be

812

* no way for any new requests to be submited or any of the io_* syscalls to be

813

* called on the context.

813

* called on the context.

814

*

814

*

815

* There may be outstanding kiocbs, but free_ioctx() will explicitly wait on

815

* There may be outstanding kiocbs, but free_ioctx() will explicitly wait on

816

* them.

816

* them.

817

*/

817

*/

818

void exit_aio(struct mm_struct *mm)

818

void exit_aio(struct mm_struct *mm)

819

{

819

{

820

struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);

820

struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);

821

int i;

821

int i;

822

823

if (!table)

823

if (!table)

824

return;

824

return;

825

826

for (i = 0; i < table->nr; ++i) {

826

for (i = 0; i < table->nr; ++i) {

827

struct kioctx *ctx = table->table[i];

827

struct kioctx *ctx = table->table[i];

828

struct completion requests_done =

828

struct completion requests_done =

829

COMPLETION_INITIALIZER_ONSTACK(requests_done);

829

COMPLETION_INITIALIZER_ONSTACK(requests_done);

830

831

if (!ctx)

831

if (!ctx)

832

continue;

832

continue;

833

/*

833

/*

834

* We don't need to bother with munmap() here - exit_mmap(mm)

834

* We don't need to bother with munmap() here - exit_mmap(mm)

835

* is coming and it'll unmap everything. And we simply can't,

835

* is coming and it'll unmap everything. And we simply can't,

836

* this is not necessarily our ->mm.

836

* this is not necessarily our ->mm.

837

* Since kill_ioctx() uses non-zero ->mmap_size as indicator

837

* Since kill_ioctx() uses non-zero ->mmap_size as indicator

838

* that it needs to unmap the area, just set it to 0.

838

* that it needs to unmap the area, just set it to 0.

839

*/

839

*/

840

ctx->mmap_size = 0;

840

ctx->mmap_size = 0;

841

kill_ioctx(mm, ctx, &requests_done);

841

kill_ioctx(mm, ctx, &requests_done);

842

843

/* Wait until all IO for the context are done. */

843

/* Wait until all IO for the context are done. */

844

wait_for_completion(&requests_done);

844

wait_for_completion(&requests_done);

845

}

845

}

846

847

RCU_INIT_POINTER(mm->ioctx_table, NULL);

847

RCU_INIT_POINTER(mm->ioctx_table, NULL);

848

kfree(table);

848

kfree(table);

849

}

849

}

850

851

static void put_reqs_available(struct kioctx *ctx, unsigned nr)

851

static void put_reqs_available(struct kioctx *ctx, unsigned nr)

852

{

852

{

853

struct kioctx_cpu *kcpu;

853

struct kioctx_cpu *kcpu;

854

unsigned long flags;

854

unsigned long flags;

855

856

local_irq_save(flags);

856

local_irq_save(flags);

857

kcpu = this_cpu_ptr(ctx->cpu);

857

kcpu = this_cpu_ptr(ctx->cpu);

858

kcpu->reqs_available += nr;

858

kcpu->reqs_available += nr;

859

860

while (kcpu->reqs_available >= ctx->req_batch * 2) {

860

while (kcpu->reqs_available >= ctx->req_batch * 2) {

861

kcpu->reqs_available -= ctx->req_batch;

861

kcpu->reqs_available -= ctx->req_batch;

862

atomic_add(ctx->req_batch, &ctx->reqs_available);

862

atomic_add(ctx->req_batch, &ctx->reqs_available);

863

}

863

}

864

865

local_irq_restore(flags);

865

local_irq_restore(flags);

866

}

866

}

867

868

static bool get_reqs_available(struct kioctx *ctx)

868

static bool get_reqs_available(struct kioctx *ctx)

869

{

869

{

870

struct kioctx_cpu *kcpu;

870

struct kioctx_cpu *kcpu;

871

bool ret = false;

871

bool ret = false;

872

unsigned long flags;

872

unsigned long flags;

873

874

local_irq_save(flags);

874

local_irq_save(flags);

875

kcpu = this_cpu_ptr(ctx->cpu);

875

kcpu = this_cpu_ptr(ctx->cpu);

876

if (!kcpu->reqs_available) {

876

if (!kcpu->reqs_available) {

877

int old, avail = atomic_read(&ctx->reqs_available);

877

int old, avail = atomic_read(&ctx->reqs_available);

878

879

do {

879

do {

880

if (avail < ctx->req_batch)

880

if (avail < ctx->req_batch)

881

goto out;

881

goto out;

882

883

old = avail;

883

old = avail;

884

avail = atomic_cmpxchg(&ctx->reqs_available,

884

avail = atomic_cmpxchg(&ctx->reqs_available,

885

avail, avail - ctx->req_batch);

885

avail, avail - ctx->req_batch);

886

} while (avail != old);

886

} while (avail != old);

887

888

kcpu->reqs_available += ctx->req_batch;

888

kcpu->reqs_available += ctx->req_batch;

889

}

889

}

890

891

ret = true;

891

ret = true;

892

kcpu->reqs_available--;

892

kcpu->reqs_available--;

893

out:

893

out:

894

local_irq_restore(flags);

894

local_irq_restore(flags);

895

return ret;

895

return ret;

896

}

896

}

897

898

/* refill_reqs_available

898

/* refill_reqs_available

899

* Updates the reqs_available reference counts used for tracking the

899

* Updates the reqs_available reference counts used for tracking the

900

* number of free slots in the completion ring. This can be called

900

* number of free slots in the completion ring. This can be called

901

* from aio_complete() (to optimistically update reqs_available) or

901

* from aio_complete() (to optimistically update reqs_available) or

902

* from aio_get_req() (the we're out of events case). It must be

902

* from aio_get_req() (the we're out of events case). It must be

903

* called holding ctx->completion_lock.

903

* called holding ctx->completion_lock.

904

*/

904

*/

905

static void refill_reqs_available(struct kioctx *ctx, unsigned head,

905

static void refill_reqs_available(struct kioctx *ctx, unsigned head,

906

unsigned tail)

906

unsigned tail)

907

{

907

{

908

unsigned events_in_ring, completed;

908

unsigned events_in_ring, completed;

909

910

/* Clamp head since userland can write to it. */

910

/* Clamp head since userland can write to it. */

911

head %= ctx->nr_events;

911

head %= ctx->nr_events;

912

if (head <= tail)

912

if (head <= tail)

913

events_in_ring = tail - head;

913

events_in_ring = tail - head;

914

else

914

else

915

events_in_ring = ctx->nr_events - (head - tail);

915

events_in_ring = ctx->nr_events - (head - tail);

916

917

completed = ctx->completed_events;

917

completed = ctx->completed_events;

918

if (events_in_ring < completed)

918

if (events_in_ring < completed)

919

completed -= events_in_ring;

919

completed -= events_in_ring;

920

else

920

else

921

completed = 0;

921

completed = 0;

922

923

if (!completed)

923

if (!completed)

924

return;

924

return;

925

926

ctx->completed_events -= completed;

926

ctx->completed_events -= completed;

927

put_reqs_available(ctx, completed);

927

put_reqs_available(ctx, completed);

928

}

928

}

929

930

/* user_refill_reqs_available

930

/* user_refill_reqs_available

931

* Called to refill reqs_available when aio_get_req() encounters an

931

* Called to refill reqs_available when aio_get_req() encounters an

932

* out of space in the completion ring.

932

* out of space in the completion ring.

933

*/

933

*/

934

static void user_refill_reqs_available(struct kioctx *ctx)

934

static void user_refill_reqs_available(struct kioctx *ctx)

935

{

935

{

936

spin_lock_irq(&ctx->completion_lock);

936

spin_lock_irq(&ctx->completion_lock);

937

if (ctx->completed_events) {

937

if (ctx->completed_events) {

938

struct aio_ring *ring;

938

struct aio_ring *ring;

939

unsigned head;

939

unsigned head;

940

941

/* Access of ring->head may race with aio_read_events_ring()

941

/* Access of ring->head may race with aio_read_events_ring()

942

* here, but that's okay since whether we read the old version

942

* here, but that's okay since whether we read the old version

943

* or the new version, and either will be valid. The important

943

* or the new version, and either will be valid. The important

944

* part is that head cannot pass tail since we prevent

944

* part is that head cannot pass tail since we prevent

945

* aio_complete() from updating tail by holding

945

* aio_complete() from updating tail by holding

946

* ctx->completion_lock. Even if head is invalid, the check

946

* ctx->completion_lock. Even if head is invalid, the check

947

* against ctx->completed_events below will make sure we do the

947

* against ctx->completed_events below will make sure we do the

948

* safe/right thing.

948

* safe/right thing.

949

*/

949

*/

950

ring = kmap_atomic(ctx->ring_pages[0]);

950

ring = kmap_atomic(ctx->ring_pages[0]);

951

head = ring->head;

951

head = ring->head;

952

kunmap_atomic(ring);

952

kunmap_atomic(ring);

953

954

refill_reqs_available(ctx, head, ctx->tail);

954

refill_reqs_available(ctx, head, ctx->tail);

955

}

955

}

956

957

spin_unlock_irq(&ctx->completion_lock);

957

spin_unlock_irq(&ctx->completion_lock);

958

}

958

}

959

960

/* aio_get_req

960

/* aio_get_req

961

* Allocate a slot for an aio request.

961

* Allocate a slot for an aio request.

962

* Returns NULL if no requests are free.

962

* Returns NULL if no requests are free.

963

*/

963

*/

964

static inline struct kiocb *aio_get_req(struct kioctx *ctx)

964

static inline struct kiocb *aio_get_req(struct kioctx *ctx)

965

{

965

{

966

struct kiocb *req;

966

struct kiocb *req;

967

968

if (!get_reqs_available(ctx)) {

968

if (!get_reqs_available(ctx)) {

969

user_refill_reqs_available(ctx);

969

user_refill_reqs_available(ctx);

970

if (!get_reqs_available(ctx))

970

if (!get_reqs_available(ctx))

971

return NULL;

971

return NULL;

972

}

972

}

973

974

req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);

974

req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);

975

if (unlikely(!req))

975

if (unlikely(!req))

976

goto out_put;

976

goto out_put;

977

978

percpu_ref_get(&ctx->reqs);

978

percpu_ref_get(&ctx->reqs);

979

980

req->ki_ctx = ctx;

980

req->ki_ctx = ctx;

981

return req;

981

return req;

982

out_put:

982

out_put:

983

put_reqs_available(ctx, 1);

983

put_reqs_available(ctx, 1);

984

return NULL;

984

return NULL;

985

}

985

}

986

987

static void kiocb_free(struct kiocb *req)

987

static void kiocb_free(struct kiocb *req)

988

{

988

{

989

if (req->ki_filp)

989

if (req->ki_filp)

990

fput(req->ki_filp);

990

fput(req->ki_filp);

991

if (req->ki_eventfd != NULL)

991

if (req->ki_eventfd != NULL)

992

eventfd_ctx_put(req->ki_eventfd);

992

eventfd_ctx_put(req->ki_eventfd);

993

kmem_cache_free(kiocb_cachep, req);

993

kmem_cache_free(kiocb_cachep, req);

994

}

994

}

995

996

static struct kioctx *lookup_ioctx(unsigned long ctx_id)

996

static struct kioctx *lookup_ioctx(unsigned long ctx_id)

997

{

997

{

998

struct aio_ring __user *ring = (void __user *)ctx_id;

998

struct aio_ring __user *ring = (void __user *)ctx_id;

999

struct mm_struct *mm = current->mm;

999

struct mm_struct *mm = current->mm;

1000

struct kioctx *ctx, *ret = NULL;

1000

struct kioctx *ctx, *ret = NULL;

1001

struct kioctx_table *table;

1001

struct kioctx_table *table;

1002

unsigned id;

1002

unsigned id;

1003

1004

if (get_user(id, &ring->id))

1004

if (get_user(id, &ring->id))

1005

return NULL;

1005

return NULL;

1006

1007

rcu_read_lock();

1007

rcu_read_lock();

1008

table = rcu_dereference(mm->ioctx_table);

1008

table = rcu_dereference(mm->ioctx_table);

1009

1010

if (!table || id >= table->nr)

1010

if (!table || id >= table->nr)

1011

goto out;

1011

goto out;

1012

1013

ctx = table->table[id];

1013

ctx = table->table[id];

1014

if (ctx && ctx->user_id == ctx_id) {

1014

if (ctx && ctx->user_id == ctx_id) {

1015

percpu_ref_get(&ctx->users);

1015

percpu_ref_get(&ctx->users);

1016

ret = ctx;

1016

ret = ctx;

1017

}

1017

}

1018

out:

1018

out:

1019

rcu_read_unlock();

1019

rcu_read_unlock();

1020

return ret;

1020

return ret;

1021

}

1021

}

1022

1023

/* aio_complete

1023

/* aio_complete

1024

* Called when the io request on the given iocb is complete.

1024

* Called when the io request on the given iocb is complete.

1025

*/

1025

*/

1026

void aio_complete(struct kiocb *iocb, long res, long res2)

1026

void aio_complete(struct kiocb *iocb, long res, long res2)

1027

{

1027

{

1028

struct kioctx *ctx = iocb->ki_ctx;

1028

struct kioctx *ctx = iocb->ki_ctx;

1029

struct aio_ring *ring;

1029

struct aio_ring *ring;

1030

struct io_event *ev_page, *event;

1030

struct io_event *ev_page, *event;

1031

unsigned tail, pos, head;

1031

unsigned tail, pos, head;

1032

unsigned long flags;

1032

unsigned long flags;

1033

1034

/*

1034

/*

1035

* Special case handling for sync iocbs:

1035

* Special case handling for sync iocbs:

1036

* - events go directly into the iocb for fast handling

1036

* - events go directly into the iocb for fast handling

1037

* - the sync task with the iocb in its stack holds the single iocb

1037

* - the sync task with the iocb in its stack holds the single iocb

1038

* ref, no other paths have a way to get another ref

1038

* ref, no other paths have a way to get another ref

1039

* - the sync task helpfully left a reference to itself in the iocb

1039

* - the sync task helpfully left a reference to itself in the iocb

1040

*/

1040

*/

1041

if (is_sync_kiocb(iocb)) {

1041

if (is_sync_kiocb(iocb)) {

1042

iocb->ki_user_data = res;

1042

iocb->ki_user_data = res;

1043

smp_wmb();

1043

smp_wmb();

1044

iocb->ki_ctx = ERR_PTR(-EXDEV);

1044

iocb->ki_ctx = ERR_PTR(-EXDEV);

1045

wake_up_process(iocb->ki_obj.tsk);

1045

wake_up_process(iocb->ki_obj.tsk);

1046

return;

1046

return;

1047

}

1047

}

1048

1049

if (iocb->ki_list.next) {

1049

if (iocb->ki_list.next) {

1050

unsigned long flags;

1050

unsigned long flags;

1051

1052

spin_lock_irqsave(&ctx->ctx_lock, flags);

1052

spin_lock_irqsave(&ctx->ctx_lock, flags);

1053

list_del(&iocb->ki_list);

1053

list_del(&iocb->ki_list);

1054

spin_unlock_irqrestore(&ctx->ctx_lock, flags);

1054

spin_unlock_irqrestore(&ctx->ctx_lock, flags);

1055

}

1055

}

1056

1057

/*

1057

/*

1058

* Add a completion event to the ring buffer. Must be done holding

1058

* Add a completion event to the ring buffer. Must be done holding

1059

* ctx->completion_lock to prevent other code from messing with the tail

1059

* ctx->completion_lock to prevent other code from messing with the tail

1060

* pointer since we might be called from irq context.

1060

* pointer since we might be called from irq context.

1061

*/

1061

*/

1062

spin_lock_irqsave(&ctx->completion_lock, flags);

1062

spin_lock_irqsave(&ctx->completion_lock, flags);

1063

1064

tail = ctx->tail;

1064

tail = ctx->tail;

1065

pos = tail + AIO_EVENTS_OFFSET;

1065

pos = tail + AIO_EVENTS_OFFSET;

1066

1067

if (++tail >= ctx->nr_events)

1067

if (++tail >= ctx->nr_events)

1068

tail = 0;

1068

tail = 0;

1069

1070

ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);

1070

ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);

1071

event = ev_page + pos % AIO_EVENTS_PER_PAGE;

1071

event = ev_page + pos % AIO_EVENTS_PER_PAGE;

1072

1073

event->obj = (u64)(unsigned long)iocb->ki_obj.user;

1073

event->obj = (u64)(unsigned long)iocb->ki_obj.user;

1074

event->data = iocb->ki_user_data;

1074

event->data = iocb->ki_user_data;

1075

event->res = res;

1075

event->res = res;

1076

event->res2 = res2;

1076

event->res2 = res2;

1077

1078

kunmap_atomic(ev_page);

1078

kunmap_atomic(ev_page);

1079

flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);

1079

flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);

1080

1081

pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",

1081

pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",

1082

ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,

1082

ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,

1083

res, res2);

1083

res, res2);

1084

1085

/* after flagging the request as done, we

1085

/* after flagging the request as done, we

1086

* must never even look at it again

1086

* must never even look at it again

1087

*/

1087

*/

1088

smp_wmb(); /* make event visible before updating tail */

1088

smp_wmb(); /* make event visible before updating tail */

1089

1090

ctx->tail = tail;

1090

ctx->tail = tail;

1091

1092

ring = kmap_atomic(ctx->ring_pages[0]);

1092

ring = kmap_atomic(ctx->ring_pages[0]);

1093

head = ring->head;

1093

head = ring->head;

1094

ring->tail = tail;

1094

ring->tail = tail;

1095

kunmap_atomic(ring);

1095

kunmap_atomic(ring);

1096

flush_dcache_page(ctx->ring_pages[0]);

1096

flush_dcache_page(ctx->ring_pages[0]);

1097

1098

ctx->completed_events++;

1098

ctx->completed_events++;

1099

if (ctx->completed_events > 1)

1099

if (ctx->completed_events > 1)

1100

refill_reqs_available(ctx, head, tail);

1100

refill_reqs_available(ctx, head, tail);

1101

spin_unlock_irqrestore(&ctx->completion_lock, flags);

1101

spin_unlock_irqrestore(&ctx->completion_lock, flags);

1102

1103

pr_debug("added to ring %p at [%u]\n", iocb, tail);

1103

pr_debug("added to ring %p at [%u]\n", iocb, tail);

1104

1105

/*

1105

/*

1106

* Check if the user asked us to deliver the result through an

1106

* Check if the user asked us to deliver the result through an

1107

* eventfd. The eventfd_signal() function is safe to be called

1107

* eventfd. The eventfd_signal() function is safe to be called

1108

* from IRQ context.

1108

* from IRQ context.

1109

*/

1109

*/

1110

if (iocb->ki_eventfd != NULL)

1110

if (iocb->ki_eventfd != NULL)

1111

eventfd_signal(iocb->ki_eventfd, 1);

1111

eventfd_signal(iocb->ki_eventfd, 1);

1112

1113

/* everything turned out well, dispose of the aiocb. */

1113

/* everything turned out well, dispose of the aiocb. */

1114

kiocb_free(iocb);

1114

kiocb_free(iocb);

1115

1116

/*

1116

/*

1117

* We have to order our ring_info tail store above and test

1117

* We have to order our ring_info tail store above and test

1118

* of the wait list below outside the wait lock. This is

1118

* of the wait list below outside the wait lock. This is

1119

* like in wake_up_bit() where clearing a bit has to be

1119

* like in wake_up_bit() where clearing a bit has to be

1120

* ordered with the unlocked test.

1120

* ordered with the unlocked test.

1121

*/

1121

*/

1122

smp_mb();

1122

smp_mb();

1123

1124

if (waitqueue_active(&ctx->wait))

1124

if (waitqueue_active(&ctx->wait))

1125

wake_up(&ctx->wait);

1125

wake_up(&ctx->wait);

1126

1127

percpu_ref_put(&ctx->reqs);

1127

percpu_ref_put(&ctx->reqs);

1128

}

1128

}

1129

EXPORT_SYMBOL(aio_complete);

1129

EXPORT_SYMBOL(aio_complete);

1130

1131

/* aio_read_events_ring

1131

/* aio_read_events_ring

1132

* Pull an event off of the ioctx's event ring. Returns the number of

1132

* Pull an event off of the ioctx's event ring. Returns the number of

1133

* events fetched

1133

* events fetched

1134

*/

1134

*/

1135

static long aio_read_events_ring(struct kioctx *ctx,

1135

static long aio_read_events_ring(struct kioctx *ctx,

1136

struct io_event __user *event, long nr)

1136

struct io_event __user *event, long nr)

1137

{

1137

{

1138

struct aio_ring *ring;

1138

struct aio_ring *ring;

1139

unsigned head, tail, pos;

1139

unsigned head, tail, pos;

1140

long ret = 0;

1140

long ret = 0;

1141

int copy_ret;

1141

int copy_ret;

1142

1143

/*

1144

* The mutex can block and wake us up and that will cause

1145

* wait_event_interruptible_hrtimeout() to schedule without sleeping

1146

* and repeat. This should be rare enough that it doesn't cause

1147

* peformance issues. See the comment in read_events() for more detail.

1148

*/

1149

sched_annotate_sleep();

1143

mutex_lock(&ctx->ring_lock);

1150

mutex_lock(&ctx->ring_lock);

1144

1151

1145

/* Access to ->ring_pages here is protected by ctx->ring_lock. */

1152

/* Access to ->ring_pages here is protected by ctx->ring_lock. */

1146

ring = kmap_atomic(ctx->ring_pages[0]);

1153

ring = kmap_atomic(ctx->ring_pages[0]);

1147

head = ring->head;

1154

head = ring->head;

1148

tail = ring->tail;

1155

tail = ring->tail;

1149

kunmap_atomic(ring);

1156

kunmap_atomic(ring);

1150

1157

1151

/*

1158

/*

1152

* Ensure that once we've read the current tail pointer, that

1159

* Ensure that once we've read the current tail pointer, that

1153

* we also see the events that were stored up to the tail.

1160

* we also see the events that were stored up to the tail.

1154

*/

1161

*/

1155

smp_rmb();

1162

smp_rmb();

1156

1163

1157

pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);

1164

pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);

1158

1165

1159

if (head == tail)

1166

if (head == tail)

1160

goto out;

1167

goto out;

1161

1168

1162

head %= ctx->nr_events;

1169

head %= ctx->nr_events;

1163

tail %= ctx->nr_events;

1170

tail %= ctx->nr_events;

1164

1171

1165

while (ret < nr) {

1172

while (ret < nr) {

1166

long avail;

1173

long avail;

1167

struct io_event *ev;

1174

struct io_event *ev;

1168

struct page *page;

1175

struct page *page;

1169

1176

1170

avail = (head <= tail ? tail : ctx->nr_events) - head;

1177

avail = (head <= tail ? tail : ctx->nr_events) - head;

1171

if (head == tail)

1178

if (head == tail)

1172

break;

1179

break;

1173

1180

1174

avail = min(avail, nr - ret);

1181

avail = min(avail, nr - ret);

1175

avail = min_t(long, avail, AIO_EVENTS_PER_PAGE -

1182

avail = min_t(long, avail, AIO_EVENTS_PER_PAGE -

1176

((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE));

1183

((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE));

1177

1184

1178

pos = head + AIO_EVENTS_OFFSET;

1185

pos = head + AIO_EVENTS_OFFSET;

1179

page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];

1186

page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];

1180

pos %= AIO_EVENTS_PER_PAGE;

1187

pos %= AIO_EVENTS_PER_PAGE;

1181

1188

1182

ev = kmap(page);

1189

ev = kmap(page);

1183

copy_ret = copy_to_user(event + ret, ev + pos,

1190

copy_ret = copy_to_user(event + ret, ev + pos,

1184

sizeof(*ev) * avail);

1191

sizeof(*ev) * avail);

1185

kunmap(page);

1192

kunmap(page);

1186

1193

1187

if (unlikely(copy_ret)) {

1194

if (unlikely(copy_ret)) {

1188

ret = -EFAULT;

1195

ret = -EFAULT;

1189

goto out;

1196

goto out;

1190

}

1197

}

1191

1198

1192

ret += avail;

1199

ret += avail;

1193

head += avail;

1200

head += avail;

1194

head %= ctx->nr_events;

1201

head %= ctx->nr_events;

1195

}

1202

}

1196

1203

1197

ring = kmap_atomic(ctx->ring_pages[0]);

1204

ring = kmap_atomic(ctx->ring_pages[0]);

1198

ring->head = head;

1205

ring->head = head;

1199

kunmap_atomic(ring);

1206

kunmap_atomic(ring);

1200

flush_dcache_page(ctx->ring_pages[0]);

1207

flush_dcache_page(ctx->ring_pages[0]);

1201

1208

1202

pr_debug("%li h%u t%u\n", ret, head, tail);

1209

pr_debug("%li h%u t%u\n", ret, head, tail);

1203

out:

1210

out:

1204

mutex_unlock(&ctx->ring_lock);

1211

mutex_unlock(&ctx->ring_lock);

1205

1212

1206

return ret;

1213

return ret;

1207

}

1214

}

1208

1215

1209

static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,

1216

static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,

1210

struct io_event __user *event, long *i)

1217

struct io_event __user *event, long *i)

1211

{

1218

{

1212

long ret = aio_read_events_ring(ctx, event + *i, nr - *i);

1219

long ret = aio_read_events_ring(ctx, event + *i, nr - *i);

1213

1220

1214

if (ret > 0)

1221

if (ret > 0)

1215

*i += ret;

1222

*i += ret;

1216

1223

1217

if (unlikely(atomic_read(&ctx->dead)))

1224

if (unlikely(atomic_read(&ctx->dead)))

1218

ret = -EINVAL;

1225

ret = -EINVAL;

1219

1226

1220

if (!*i)

1227

if (!*i)

1221

*i = ret;

1228

*i = ret;

1222

1229

1223

return ret < 0 || *i >= min_nr;

1230

return ret < 0 || *i >= min_nr;

1224

}

1231

}

1225

1232

1226

static long read_events(struct kioctx *ctx, long min_nr, long nr,

1233

static long read_events(struct kioctx *ctx, long min_nr, long nr,

1227

struct io_event __user *event,

1234

struct io_event __user *event,

1228

struct timespec __user *timeout)

1235

struct timespec __user *timeout)

1229

{

1236

{

1230

ktime_t until = { .tv64 = KTIME_MAX };

1237

ktime_t until = { .tv64 = KTIME_MAX };

1231

long ret = 0;

1238

long ret = 0;

1232

1239

1233

if (timeout) {

1240

if (timeout) {

1234

struct timespec ts;

1241

struct timespec ts;

1235

1242

1236

if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))

1243

if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))

1237

return -EFAULT;

1244

return -EFAULT;

1238

1245

1239

until = timespec_to_ktime(ts);

1246

until = timespec_to_ktime(ts);

1240

}

1247

}

1241

1248

1242

/*

1249

/*

1243

* Note that aio_read_events() is being called as the conditional - i.e.

1250

* Note that aio_read_events() is being called as the conditional - i.e.

1244

* we're calling it after prepare_to_wait() has set task state to

1251

* we're calling it after prepare_to_wait() has set task state to

1245

* TASK_INTERRUPTIBLE.

1252

* TASK_INTERRUPTIBLE.

1246

*

1253

*

1247

* But aio_read_events() can block, and if it blocks it's going to flip

1254

* But aio_read_events() can block, and if it blocks it's going to flip

1248

* the task state back to TASK_RUNNING.

1255

* the task state back to TASK_RUNNING.

1249

*

1256

*

1250

* This should be ok, provided it doesn't flip the state back to

1257

* This should be ok, provided it doesn't flip the state back to

1251

* TASK_RUNNING and return 0 too much - that causes us to spin. That

1258

* TASK_RUNNING and return 0 too much - that causes us to spin. That

1252

* will only happen if the mutex_lock() call blocks, and we then find

1259

* will only happen if the mutex_lock() call blocks, and we then find

1253

* the ringbuffer empty. So in practice we should be ok, but it's

1260

* the ringbuffer empty. So in practice we should be ok, but it's

1254

* something to be aware of when touching this code.

1261

* something to be aware of when touching this code.

1255

*/

1262

*/

1256

if (until.tv64 == 0)

1263

if (until.tv64 == 0)

1257

aio_read_events(ctx, min_nr, nr, event, &ret);

1264

aio_read_events(ctx, min_nr, nr, event, &ret);

1258

else

1265

else

1259

wait_event_interruptible_hrtimeout(ctx->wait,

1266

wait_event_interruptible_hrtimeout(ctx->wait,

1260

aio_read_events(ctx, min_nr, nr, event, &ret),

1267

aio_read_events(ctx, min_nr, nr, event, &ret),

1261

until);

1268

until);

1262

1269

1263

if (!ret && signal_pending(current))

1270

if (!ret && signal_pending(current))

1264

ret = -EINTR;

1271

ret = -EINTR;

1265

1272

1266

return ret;

1273

return ret;

1267

}

1274

}

1268

1275

1269

/* sys_io_setup:

1276

/* sys_io_setup:

1270

* Create an aio_context capable of receiving at least nr_events.

1277

* Create an aio_context capable of receiving at least nr_events.

1271

* ctxp must not point to an aio_context that already exists, and

1278

* ctxp must not point to an aio_context that already exists, and

1272

* must be initialized to 0 prior to the call. On successful

1279

* must be initialized to 0 prior to the call. On successful

1273

* creation of the aio_context, *ctxp is filled in with the resulting

1280

* creation of the aio_context, *ctxp is filled in with the resulting

1274

* handle. May fail with -EINVAL if *ctxp is not initialized,

1281

* handle. May fail with -EINVAL if *ctxp is not initialized,

1275

* if the specified nr_events exceeds internal limits. May fail

1282

* if the specified nr_events exceeds internal limits. May fail

1276

* with -EAGAIN if the specified nr_events exceeds the user's limit

1283

* with -EAGAIN if the specified nr_events exceeds the user's limit

1277

* of available events. May fail with -ENOMEM if insufficient kernel

1284

* of available events. May fail with -ENOMEM if insufficient kernel

1278

* resources are available. May fail with -EFAULT if an invalid

1285

* resources are available. May fail with -EFAULT if an invalid

1279

* pointer is passed for ctxp. Will fail with -ENOSYS if not

1286

* pointer is passed for ctxp. Will fail with -ENOSYS if not

1280

* implemented.

1287

* implemented.

1281

*/

1288

*/

1282

SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)

1289

SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)

1283

{

1290

{

1284

struct kioctx *ioctx = NULL;

1291

struct kioctx *ioctx = NULL;

1285

unsigned long ctx;

1292

unsigned long ctx;

1286

long ret;

1293

long ret;

1287

1294

1288

ret = get_user(ctx, ctxp);

1295

ret = get_user(ctx, ctxp);

1289

if (unlikely(ret))

1296

if (unlikely(ret))

1290

goto out;

1297

goto out;

1291

1298

1292

ret = -EINVAL;

1299

ret = -EINVAL;

1293

if (unlikely(ctx || nr_events == 0)) {

1300

if (unlikely(ctx || nr_events == 0)) {

1294

pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n",

1301

pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n",

1295

ctx, nr_events);

1302

ctx, nr_events);

1296

goto out;

1303

goto out;

1297

}

1304

}

1298

1305

1299

ioctx = ioctx_alloc(nr_events);

1306

ioctx = ioctx_alloc(nr_events);

1300

ret = PTR_ERR(ioctx);

1307

ret = PTR_ERR(ioctx);

1301

if (!IS_ERR(ioctx)) {

1308

if (!IS_ERR(ioctx)) {

1302

ret = put_user(ioctx->user_id, ctxp);

1309

ret = put_user(ioctx->user_id, ctxp);

1303

if (ret)

1310

if (ret)

1304

kill_ioctx(current->mm, ioctx, NULL);

1311

kill_ioctx(current->mm, ioctx, NULL);

1305

percpu_ref_put(&ioctx->users);

1312

percpu_ref_put(&ioctx->users);

1306

}

1313

}

1307

1314

1308

out:

1315

out:

1309

return ret;

1316

return ret;

1310

}

1317

}

1311

1318

1312

/* sys_io_destroy:

1319

/* sys_io_destroy:

1313

* Destroy the aio_context specified. May cancel any outstanding

1320

* Destroy the aio_context specified. May cancel any outstanding

1314

* AIOs and block on completion. Will fail with -ENOSYS if not

1321

* AIOs and block on completion. Will fail with -ENOSYS if not

1315

* implemented. May fail with -EINVAL if the context pointed to

1322

* implemented. May fail with -EINVAL if the context pointed to

1316

* is invalid.

1323

* is invalid.

1317

*/

1324

*/

1318

SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)

1325

SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)

1319

{

1326

{

1320

struct kioctx *ioctx = lookup_ioctx(ctx);

1327

struct kioctx *ioctx = lookup_ioctx(ctx);

1321

if (likely(NULL != ioctx)) {

1328

if (likely(NULL != ioctx)) {

1322

struct completion requests_done =

1329

struct completion requests_done =

1323

COMPLETION_INITIALIZER_ONSTACK(requests_done);

1330

COMPLETION_INITIALIZER_ONSTACK(requests_done);

1324

int ret;

1331

int ret;

1325

1332

1326

/* Pass requests_done to kill_ioctx() where it can be set

1333

/* Pass requests_done to kill_ioctx() where it can be set

1327

* in a thread-safe way. If we try to set it here then we have

1334

* in a thread-safe way. If we try to set it here then we have

1328

* a race condition if two io_destroy() called simultaneously.

1335

* a race condition if two io_destroy() called simultaneously.

1329

*/

1336

*/

1330

ret = kill_ioctx(current->mm, ioctx, &requests_done);

1337

ret = kill_ioctx(current->mm, ioctx, &requests_done);

1331

percpu_ref_put(&ioctx->users);

1338

percpu_ref_put(&ioctx->users);

1332

1339

1333

/* Wait until all IO for the context are done. Otherwise kernel

1340

/* Wait until all IO for the context are done. Otherwise kernel

1334

* keep using user-space buffers even if user thinks the context

1341

* keep using user-space buffers even if user thinks the context

1335

* is destroyed.

1342

* is destroyed.

1336

*/

1343

*/

1337

if (!ret)

1344

if (!ret)

1338

wait_for_completion(&requests_done);

1345

wait_for_completion(&requests_done);

1339

1346

1340

return ret;

1347

return ret;

1341

}

1348

}

1342

pr_debug("EINVAL: io_destroy: invalid context id\n");

1349

pr_debug("EINVAL: io_destroy: invalid context id\n");

1343

return -EINVAL;

1350

return -EINVAL;

1344

}

1351

}

1345

1352

1346

typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *,

1353

typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *,

1347

unsigned long, loff_t);

1354

unsigned long, loff_t);

1348

typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *);

1355

typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *);

1349

1356

1350

static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,

1357

static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,

1351

int rw, char __user *buf,

1358

int rw, char __user *buf,

1352

unsigned long *nr_segs,

1359

unsigned long *nr_segs,

1353

struct iovec **iovec,

1360

struct iovec **iovec,

1354

bool compat)

1361

bool compat)

1355

{

1362

{

1356

ssize_t ret;

1363

ssize_t ret;

1357

1364

1358

*nr_segs = kiocb->ki_nbytes;

1365

*nr_segs = kiocb->ki_nbytes;

1359

1366

1360

#ifdef CONFIG_COMPAT

1367

#ifdef CONFIG_COMPAT

1361

if (compat)

1368

if (compat)

1362

ret = compat_rw_copy_check_uvector(rw,

1369

ret = compat_rw_copy_check_uvector(rw,

1363

(struct compat_iovec __user *)buf,

1370

(struct compat_iovec __user *)buf,

1364

*nr_segs, UIO_FASTIOV, *iovec, iovec);

1371

*nr_segs, UIO_FASTIOV, *iovec, iovec);

1365

else

1372

else

1366

#endif

1373

#endif

1367

ret = rw_copy_check_uvector(rw,

1374

ret = rw_copy_check_uvector(rw,

1368

(struct iovec __user *)buf,

1375

(struct iovec __user *)buf,

1369

*nr_segs, UIO_FASTIOV, *iovec, iovec);

1376

*nr_segs, UIO_FASTIOV, *iovec, iovec);

1370

if (ret < 0)

1377

if (ret < 0)

1371

return ret;

1378

return ret;

1372

1379

1373

/* ki_nbytes now reflect bytes instead of segs */

1380

/* ki_nbytes now reflect bytes instead of segs */

1374

kiocb->ki_nbytes = ret;

1381

kiocb->ki_nbytes = ret;

1375

return 0;

1382

return 0;

1376

}

1383

}

1377

1384

1378

static ssize_t aio_setup_single_vector(struct kiocb *kiocb,

1385

static ssize_t aio_setup_single_vector(struct kiocb *kiocb,

1379

int rw, char __user *buf,

1386

int rw, char __user *buf,

1380

unsigned long *nr_segs,

1387

unsigned long *nr_segs,

1381

struct iovec *iovec)

1388

struct iovec *iovec)

1382

{

1389

{

1383

if (unlikely(!access_ok(!rw, buf, kiocb->ki_nbytes)))

1390

if (unlikely(!access_ok(!rw, buf, kiocb->ki_nbytes)))

1384

return -EFAULT;

1391

return -EFAULT;

1385

1392

1386

iovec->iov_base = buf;

1393

iovec->iov_base = buf;

1387

iovec->iov_len = kiocb->ki_nbytes;

1394

iovec->iov_len = kiocb->ki_nbytes;

1388

*nr_segs = 1;

1395

*nr_segs = 1;

1389

return 0;

1396

return 0;

1390

}

1397

}

1391

1398

1392

/*

1399

/*

1393

* aio_run_iocb:

1400

* aio_run_iocb:

1394

* Performs the initial checks and io submission.

1401

* Performs the initial checks and io submission.

1395

*/

1402

*/

1396

static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,

1403

static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,

1397

char __user *buf, bool compat)

1404

char __user *buf, bool compat)

1398

{

1405

{

1399

struct file *file = req->ki_filp;

1406

struct file *file = req->ki_filp;

1400

ssize_t ret;

1407

ssize_t ret;

1401

unsigned long nr_segs;

1408

unsigned long nr_segs;

1402

int rw;

1409

int rw;

1403

fmode_t mode;

1410

fmode_t mode;

1404

aio_rw_op *rw_op;

1411

aio_rw_op *rw_op;

1405

rw_iter_op *iter_op;

1412

rw_iter_op *iter_op;

1406

struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;

1413

struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;

1407

struct iov_iter iter;

1414

struct iov_iter iter;

1408

1415

1409

switch (opcode) {

1416

switch (opcode) {

1410

case IOCB_CMD_PREAD:

1417

case IOCB_CMD_PREAD:

1411

case IOCB_CMD_PREADV:

1418

case IOCB_CMD_PREADV:

1412

mode = FMODE_READ;

1419

mode = FMODE_READ;

1413

rw = READ;

1420

rw = READ;

1414

rw_op = file->f_op->aio_read;

1421

rw_op = file->f_op->aio_read;

1415

iter_op = file->f_op->read_iter;

1422

iter_op = file->f_op->read_iter;

1416

goto rw_common;

1423

goto rw_common;

1417

1424

1418

case IOCB_CMD_PWRITE:

1425

case IOCB_CMD_PWRITE:

1419

case IOCB_CMD_PWRITEV:

1426

case IOCB_CMD_PWRITEV:

1420

mode = FMODE_WRITE;

1427

mode = FMODE_WRITE;

1421

rw = WRITE;

1428

rw = WRITE;

1422

rw_op = file->f_op->aio_write;

1429

rw_op = file->f_op->aio_write;

1423

iter_op = file->f_op->write_iter;

1430

iter_op = file->f_op->write_iter;

1424

goto rw_common;

1431

goto rw_common;

1425

rw_common:

1432

rw_common:

1426

if (unlikely(!(file->f_mode & mode)))

1433

if (unlikely(!(file->f_mode & mode)))

1427

return -EBADF;

1434

return -EBADF;

1428

1435

1429

if (!rw_op && !iter_op)

1436

if (!rw_op && !iter_op)

1430

return -EINVAL;

1437

return -EINVAL;

1431

1438

1432

ret = (opcode == IOCB_CMD_PREADV ||

1439

ret = (opcode == IOCB_CMD_PREADV ||

1433

opcode == IOCB_CMD_PWRITEV)

1440

opcode == IOCB_CMD_PWRITEV)

1434

? aio_setup_vectored_rw(req, rw, buf, &nr_segs,

1441

? aio_setup_vectored_rw(req, rw, buf, &nr_segs,

1435

&iovec, compat)

1442

&iovec, compat)

1436

: aio_setup_single_vector(req, rw, buf, &nr_segs,

1443

: aio_setup_single_vector(req, rw, buf, &nr_segs,

1437

iovec);

1444

iovec);

1438

if (!ret)

1445

if (!ret)

1439

ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);

1446

ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);

1440

if (ret < 0) {

1447

if (ret < 0) {

1441

if (iovec != inline_vecs)

1448

if (iovec != inline_vecs)

1442

kfree(iovec);

1449

kfree(iovec);

1443

return ret;

1450

return ret;

1444

}

1451

}

1445

1452

1446

req->ki_nbytes = ret;

1453

req->ki_nbytes = ret;

1447

1454

1448

/* XXX: move/kill - rw_verify_area()? */

1455

/* XXX: move/kill - rw_verify_area()? */

1449

/* This matches the pread()/pwrite() logic */

1456

/* This matches the pread()/pwrite() logic */

1450

if (req->ki_pos < 0) {

1457

if (req->ki_pos < 0) {

1451

ret = -EINVAL;

1458

ret = -EINVAL;

1452

break;

1459

break;

1453

}

1460

}

1454

1461

1455

if (rw == WRITE)

1462

if (rw == WRITE)

1456

file_start_write(file);

1463

file_start_write(file);

1457

1464

1458

if (iter_op) {

1465

if (iter_op) {

1459

iov_iter_init(&iter, rw, iovec, nr_segs, req->ki_nbytes);

1466

iov_iter_init(&iter, rw, iovec, nr_segs, req->ki_nbytes);

1460

ret = iter_op(req, &iter);

1467

ret = iter_op(req, &iter);

1461

} else {

1468

} else {

1462

ret = rw_op(req, iovec, nr_segs, req->ki_pos);

1469

ret = rw_op(req, iovec, nr_segs, req->ki_pos);

1463

}

1470

}

1464

1471

1465

if (rw == WRITE)

1472

if (rw == WRITE)

1466

file_end_write(file);

1473

file_end_write(file);

1467

break;

1474

break;

1468

1475

1469

case IOCB_CMD_FDSYNC:

1476

case IOCB_CMD_FDSYNC:

1470

if (!file->f_op->aio_fsync)

1477

if (!file->f_op->aio_fsync)

1471

return -EINVAL;

1478

return -EINVAL;

1472

1479

1473

ret = file->f_op->aio_fsync(req, 1);

1480

ret = file->f_op->aio_fsync(req, 1);

1474

break;

1481

break;

1475

1482

1476

case IOCB_CMD_FSYNC:

1483

case IOCB_CMD_FSYNC:

1477

if (!file->f_op->aio_fsync)

1484

if (!file->f_op->aio_fsync)

1478

return -EINVAL;

1485

return -EINVAL;

1479

1486

1480

ret = file->f_op->aio_fsync(req, 0);

1487

ret = file->f_op->aio_fsync(req, 0);

1481

break;

1488

break;

1482

1489

1483

default:

1490

default:

1484

pr_debug("EINVAL: no operation provided\n");

1491

pr_debug("EINVAL: no operation provided\n");

1485

return -EINVAL;

1492

return -EINVAL;

1486

}

1493

}

1487

1494

1488

if (iovec != inline_vecs)

1495

if (iovec != inline_vecs)

1489

kfree(iovec);

1496

kfree(iovec);

1490

1497

1491

if (ret != -EIOCBQUEUED) {

1498

if (ret != -EIOCBQUEUED) {

1492

/*

1499

/*

1493

* There's no easy way to restart the syscall since other AIO's

1500

* There's no easy way to restart the syscall since other AIO's

1494

* may be already running. Just fail this IO with EINTR.

1501

* may be already running. Just fail this IO with EINTR.

1495

*/

1502

*/

1496

if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||

1503

if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||

1497

ret == -ERESTARTNOHAND ||

1504

ret == -ERESTARTNOHAND ||

1498

ret == -ERESTART_RESTARTBLOCK))

1505

ret == -ERESTART_RESTARTBLOCK))

1499

ret = -EINTR;

1506

ret = -EINTR;

1500

aio_complete(req, ret, 0);

1507

aio_complete(req, ret, 0);

1501

}

1508

}

1502

1509

1503

return 0;

1510

return 0;

1504

}

1511

}

1505

1512

1506

/*
 * Validate and submit one iocb.  @iocb is the kernel copy of the user's
 * control block at @user_iocb.
 *
 * Returns 0 when the request was handed to aio_run_iocb() successfully.
 * Otherwise returns a negative errno (-EINVAL for reserved fields or
 * overflowing values, -EAGAIN when no request slot is available, -EBADF
 * for a bad file descriptor, the eventfd lookup error, the put_user()
 * error, or the aio_run_iocb() error) after releasing the request.
 */
static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
			 struct iocb *iocb, bool compat)
{
	struct kiocb *kreq;
	ssize_t err;

	/* enforce forwards compatibility on users */
	if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) {
		pr_debug("EINVAL: reserve field set\n");
		return -EINVAL;
	}

	/* prevent overflows */
	if (unlikely(iocb->aio_buf != (unsigned long)iocb->aio_buf ||
		     iocb->aio_nbytes != (size_t)iocb->aio_nbytes ||
		     (ssize_t)iocb->aio_nbytes < 0)) {
		pr_debug("EINVAL: io_submit: overflow check\n");
		return -EINVAL;
	}

	kreq = aio_get_req(ctx);
	if (unlikely(!kreq))
		return -EAGAIN;

	kreq->ki_filp = fget(iocb->aio_fildes);
	if (unlikely(!kreq->ki_filp)) {
		err = -EBADF;
		goto out_put_req;
	}

	if (iocb->aio_flags & IOCB_FLAG_RESFD) {
		/*
		 * IOCB_FLAG_RESFD is set in aio_flags: take a reference
		 * on the eventfd context now.  The descriptor must be an
		 * eventfd() fd; it is signaled for every completed event
		 * via eventfd_signal().
		 */
		kreq->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
		if (IS_ERR(kreq->ki_eventfd)) {
			err = PTR_ERR(kreq->ki_eventfd);
			kreq->ki_eventfd = NULL;
			goto out_put_req;
		}
	}

	err = put_user(KIOCB_KEY, &user_iocb->aio_key);
	if (unlikely(err)) {
		pr_debug("EFAULT: aio_key\n");
		goto out_put_req;
	}

	kreq->ki_obj.user = user_iocb;
	kreq->ki_user_data = iocb->aio_data;
	kreq->ki_pos = iocb->aio_offset;
	kreq->ki_nbytes = iocb->aio_nbytes;

	err = aio_run_iocb(kreq, iocb->aio_lio_opcode,
			   (char __user *)(unsigned long)iocb->aio_buf,
			   compat);
	if (!err)
		return 0;

out_put_req:
	/* Give back the request slot and the context reference, then free. */
	put_reqs_available(ctx, 1);
	percpu_ref_put(&ctx->reqs);
	kiocb_free(kreq);
	return err;
}

1577

1584

1578

long do_io_submit(aio_context_t ctx_id, long nr,

1585

long do_io_submit(aio_context_t ctx_id, long nr,

1579

struct iocb __user *__user *iocbpp, bool compat)

1586

struct iocb __user *__user *iocbpp, bool compat)

1580

{

1587

{

1581

struct kioctx *ctx;

1588

struct kioctx *ctx;

1582

long ret = 0;

1589

long ret = 0;

1583

int i = 0;

1590

int i = 0;

1584

struct blk_plug plug;

1591

struct blk_plug plug;

1585

1592

1586

if (unlikely(nr < 0))

1593

if (unlikely(nr < 0))

1587

return -EINVAL;

1594

return -EINVAL;

1588

1595

1589

if (unlikely(nr > LONG_MAX/sizeof(*iocbpp)))

1596

if (unlikely(nr > LONG_MAX/sizeof(*iocbpp)))

1590

nr = LONG_MAX/sizeof(*iocbpp);

1597

nr = LONG_MAX/sizeof(*iocbpp);

1591

1598

1592

if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp)))))

1599

if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp)))))

1593

return -EFAULT;

1600

return -EFAULT;

1594

1601

1595

ctx = lookup_ioctx(ctx_id);

1602

ctx = lookup_ioctx(ctx_id);

1596

if (unlikely(!ctx)) {

1603

if (unlikely(!ctx)) {

1597

pr_debug("EINVAL: invalid context id\n");

1604

pr_debug("EINVAL: invalid context id\n");

1598

return -EINVAL;

1605

return -EINVAL;

1599

}

1606

}

1600

1607

1601

blk_start_plug(&plug);

1608

blk_start_plug(&plug);

1602

1609

1603

/*

1610

/*

1604

* AKPM: should this return a partial result if some of the IOs were

1611

* AKPM: should this return a partial result if some of the IOs were

1605

* successfully submitted?

1612

* successfully submitted?

1606

*/

1613

*/

1607

for (i=0; i<nr; i++) {

1614

for (i=0; i<nr; i++) {

1608

struct iocb __user *user_iocb;

1615

struct iocb __user *user_iocb;

1609

struct iocb tmp;

1616

struct iocb tmp;

1610

1617

1611

if (unlikely(__get_user(user_iocb, iocbpp + i))) {

1618

if (unlikely(__get_user(user_iocb, iocbpp + i))) {

1612

ret = -EFAULT;

1619

ret = -EFAULT;

1613

break;

1620

break;

1614

}

1621

}

1615

1622

1616

if (unlikely(copy_from_user(&tmp, user_iocb, sizeof(tmp)))) {

1623

if (unlikely(copy_from_user(&tmp, user_iocb, sizeof(tmp)))) {

1617

ret = -EFAULT;

1624

ret = -EFAULT;

1618

break;

1625

break;

1619

}

1626

}

1620

1627

1621

ret = io_submit_one(ctx, user_iocb, &tmp, compat);

1628

ret = io_submit_one(ctx, user_iocb, &tmp, compat);

1622

if (ret)

1629

if (ret)

1623

break;

1630

break;

1624

}

1631

}

1625

blk_finish_plug(&plug);

1632

blk_finish_plug(&plug);

1626

1633

1627

percpu_ref_put(&ctx->users);

1634

percpu_ref_put(&ctx->users);

1628

return i ? i : ret;

1635

return i ? i : ret;

1629

}

1636

}

1630

1637

1631

/* sys_io_submit:

1638

/* sys_io_submit:

1632

* Queue the nr iocbs pointed to by iocbpp for processing. Returns

1639

* Queue the nr iocbs pointed to by iocbpp for processing. Returns

1633

* the number of iocbs queued. May return -EINVAL if the aio_context

1640

* the number of iocbs queued. May return -EINVAL if the aio_context

1634

* specified by ctx_id is invalid, if nr is < 0, if the iocb at

1641

* specified by ctx_id is invalid, if nr is < 0, if the iocb at

1635

* *iocbpp[0] is not properly initialized, if the operation specified

1642

* *iocbpp[0] is not properly initialized, if the operation specified

1636

* is invalid for the file descriptor in the iocb. May fail with

1643

* is invalid for the file descriptor in the iocb. May fail with

1637

* -EFAULT if any of the data structures point to invalid data. May

1644

* -EFAULT if any of the data structures point to invalid data. May

1638

* fail with -EBADF if the file descriptor specified in the first

1645

* fail with -EBADF if the file descriptor specified in the first

1639

* iocb is invalid. May fail with -EAGAIN if insufficient resources

1646

* iocb is invalid. May fail with -EAGAIN if insufficient resources

1640

* are available to queue any iocbs. Will return 0 if nr is 0. Will

1647

* are available to queue any iocbs. Will return 0 if nr is 0. Will

1641

* fail with -ENOSYS if not implemented.

1648

* fail with -ENOSYS if not implemented.

1642

*/

1649

*/

1643

SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,

1650

SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,

1644

struct iocb __user * __user *, iocbpp)

1651

struct iocb __user * __user *, iocbpp)

1645

{

1652

{

1646

return do_io_submit(ctx_id, nr, iocbpp, 0);

1653

return do_io_submit(ctx_id, nr, iocbpp, 0);

1647

}

1654

}

1648

1655

1649

/* lookup_kiocb

1656

/* lookup_kiocb

1650

* Finds a given iocb for cancellation.

1657

* Finds a given iocb for cancellation.

1651

*/

1658

*/

1652

static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,

1659

static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,

1653

u32 key)

1660

u32 key)

1654

{

1661

{

1655

struct list_head *pos;

1662

struct list_head *pos;

1656

1663

1657

assert_spin_locked(&ctx->ctx_lock);

1664

assert_spin_locked(&ctx->ctx_lock);

1658

1665

1659

if (key != KIOCB_KEY)

1666

if (key != KIOCB_KEY)

1660

return NULL;

1667

return NULL;

1661

1668

1662

/* TODO: use a hash or array, this sucks. */

1669

/* TODO: use a hash or array, this sucks. */

1663

list_for_each(pos, &ctx->active_reqs) {

1670

list_for_each(pos, &ctx->active_reqs) {

1664

struct kiocb *kiocb = list_kiocb(pos);

1671

struct kiocb *kiocb = list_kiocb(pos);

1665

if (kiocb->ki_obj.user == iocb)

1672

if (kiocb->ki_obj.user == iocb)

1666

return kiocb;

1673

return kiocb;

1667

}

1674

}

1668

return NULL;

1675

return NULL;

1669

}

1676

}

1670

1677

1671

/* sys_io_cancel:

1678

/* sys_io_cancel:

1672

* Attempts to cancel an iocb previously passed to io_submit. If

1679

* Attempts to cancel an iocb previously passed to io_submit. If

1673

* the operation is successfully cancelled, the resulting event is

1680

* the operation is successfully cancelled, the resulting event is

1674

* copied into the memory pointed to by result without being placed

1681

* copied into the memory pointed to by result without being placed

1675

* into the completion queue and 0 is returned. May fail with

1682

* into the completion queue and 0 is returned. May fail with

1676

* -EFAULT if any of the data structures pointed to are invalid.

1683

* -EFAULT if any of the data structures pointed to are invalid.

1677

* May fail with -EINVAL if aio_context specified by ctx_id is

1684

* May fail with -EINVAL if aio_context specified by ctx_id is

1678

* invalid. May fail with -EAGAIN if the iocb specified was not

1685

* invalid. May fail with -EAGAIN if the iocb specified was not

1679

* cancelled. Will fail with -ENOSYS if not implemented.

1686

* cancelled. Will fail with -ENOSYS if not implemented.

1680

*/

1687

*/

1681

SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,

1688

SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,

1682

struct io_event __user *, result)

1689

struct io_event __user *, result)

1683

{

1690

{

1684

struct kioctx *ctx;

1691

struct kioctx *ctx;

1685

struct kiocb *kiocb;

1692

struct kiocb *kiocb;

1686

u32 key;

1693

u32 key;

1687

int ret;

1694

int ret;

1688

1695

1689

ret = get_user(key, &iocb->aio_key);

1696

ret = get_user(key, &iocb->aio_key);

1690

if (unlikely(ret))

1697

if (unlikely(ret))

1691

return -EFAULT;

1698

return -EFAULT;

1692

1699

1693

ctx = lookup_ioctx(ctx_id);

1700

ctx = lookup_ioctx(ctx_id);

1694

if (unlikely(!ctx))

1701

if (unlikely(!ctx))

1695

return -EINVAL;

1702

return -EINVAL;

1696

1703

1697

spin_lock_irq(&ctx->ctx_lock);

1704

spin_lock_irq(&ctx->ctx_lock);

1698

1705

1699

kiocb = lookup_kiocb(ctx, iocb, key);

1706

kiocb = lookup_kiocb(ctx, iocb, key);

1700

if (kiocb)

1707

if (kiocb)

1701

ret = kiocb_cancel(kiocb);

1708

ret = kiocb_cancel(kiocb);

1702

else

1709

else

1703

ret = -EINVAL;

1710

ret = -EINVAL;

1704

1711

1705

spin_unlock_irq(&ctx->ctx_lock);

1712

spin_unlock_irq(&ctx->ctx_lock);

1706

1713

1707

if (!ret) {

1714

if (!ret) {

1708

/*

1715

/*

1709

* The result argument is no longer used - the io_event is

1716

* The result argument is no longer used - the io_event is

1710

* always delivered via the ring buffer. -EINPROGRESS indicates

1717

* always delivered via the ring buffer. -EINPROGRESS indicates

1711

* cancellation is progress:

1718

* cancellation is progress:

1712

*/

1719

*/

1713

ret = -EINPROGRESS;

1720

ret = -EINPROGRESS;

1714

}

1721

}

1715

1722

1716

percpu_ref_put(&ctx->users);

1723

percpu_ref_put(&ctx->users);

1717

1724

1718

return ret;

1725

return ret;

1719

}

1726

}

1720

1727

1721

/* io_getevents:

1728

/* io_getevents:

1722

* Attempts to read at least min_nr events and up to nr events from

1729

* Attempts to read at least min_nr events and up to nr events from

1723

* the completion queue for the aio_context specified by ctx_id. If

1730

* the completion queue for the aio_context specified by ctx_id. If

1724

* it succeeds, the number of read events is returned. May fail with

1731

* it succeeds, the number of read events is returned. May fail with

1725

* -EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is

1732

* -EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is

1726

* out of range, if timeout is out of range. May fail with -EFAULT

1733

* out of range, if timeout is out of range. May fail with -EFAULT

1727

* if any of the memory specified is invalid. May return 0 or

1734

* if any of the memory specified is invalid. May return 0 or

1728

* < min_nr if the timeout specified by timeout has elapsed

1735

* < min_nr if the timeout specified by timeout has elapsed

1729

* before sufficient events are available, where timeout == NULL

1736

* before sufficient events are available, where timeout == NULL

1730

* specifies an infinite timeout. Note that the timeout pointed to by

1737

* specifies an infinite timeout. Note that the timeout pointed to by

1731

* timeout is relative. Will fail with -ENOSYS if not implemented.

1738

* timeout is relative. Will fail with -ENOSYS if not implemented.

1732

*/

1739

*/

1733

SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,

1740

SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,

1734

long, min_nr,

1741

long, min_nr,

1735

long, nr,

1742

long, nr,

1736

struct io_event __user *, events,

1743

struct io_event __user *, events,

1737

struct timespec __user *, timeout)

1744

struct timespec __user *, timeout)

1738

{

1745

{

1739

struct kioctx *ioctx = lookup_ioctx(ctx_id);

1746

struct kioctx *ioctx = lookup_ioctx(ctx_id);

1740

long ret = -EINVAL;

1747

long ret = -EINVAL;

1741

1748

1742

if (likely(ioctx)) {

1749

if (likely(ioctx)) {

1743

if (likely(min_nr <= nr && min_nr >= 0))

1750

if (likely(min_nr <= nr && min_nr >= 0))

1744

ret = read_events(ioctx, min_nr, nr, events, timeout);

1751

ret = read_events(ioctx, min_nr, nr, events, timeout);

1745

percpu_ref_put(&ioctx->users);

1752

percpu_ref_put(&ioctx->users);

1746

}

1753

}

1747

return ret;

1754

return ret;

1748

}

1755

}

1749

1756

GITLAB

Merge git://git.kvack.org/~bcrl/aio-fixes

 /*
  *	An async IO implementation for Linux
  *	Written by Benjamin LaHaise <bcrl@kvack.org>
  *
  *	Implements an efficient asynchronous io interface.
  *
  *	Copyright 2000, 2001, 2002 Red Hat, Inc.  All Rights Reserved.
  *
  *	See ../COPYING for licensing terms.
  */
 #define pr_fmt(fmt) "%s: " fmt, __func__
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/time.h>
 #include <linux/aio_abi.h>
 #include <linux/export.h>
 #include <linux/syscalls.h>
 #include <linux/backing-dev.h>
 #include <linux/uio.h>
 #include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/mmu_context.h>
 #include <linux/percpu.h>
 #include <linux/slab.h>
 #include <linux/timer.h>
 #include <linux/aio.h>
 #include <linux/highmem.h>
 #include <linux/workqueue.h>
 #include <linux/security.h>
 #include <linux/eventfd.h>
 #include <linux/blkdev.h>
 #include <linux/compat.h>
 #include <linux/migrate.h>
 #include <linux/ramfs.h>
 #include <linux/percpu-refcount.h>
 #include <linux/mount.h>
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 /*
  * Identification values stored in the aio_ring header by aio_setup_ring().
  * AIO_RING_MAGIC is also passed to mount_pseudo() as the magic number of
  * the internal aio filesystem.
  */
 #define AIO_RING_MAGIC			0xa10a10a1
 #define AIO_RING_COMPAT_FEATURES	1
 #define AIO_RING_INCOMPAT_FEATURES	0
 /*
  * Header of the completion ring.  It sits at the start of the first ring
  * page (aio_setup_ring() fills it in through ctx->ring_pages[0]) and is
  * followed directly by the io_event slots.  The ring pages are mapped into
  * the process, so userland can read and write these fields.
  */
 struct aio_ring {
 	unsigned	id;	/* kernel internal index number */
 	unsigned	nr;	/* number of io_events */
 	unsigned	head;	/* Written to by userland or under ring_lock
 				 * mutex by aio_read_events_ring(). */
 	unsigned	tail;	/* next slot to be filled; written by aio_complete() */
 	unsigned	magic;	/* AIO_RING_MAGIC */
 	unsigned	compat_features;
 	unsigned	incompat_features;
 	unsigned	header_length;	/* size of aio_ring */
 	struct io_event		io_events[0];
 }; /* header (eight unsigned fields) + ring size */
 /*
  * Number of ring page pointers embedded in struct kioctx (internal_pages);
  * aio_setup_ring() allocates a separate array when a ring needs more.
  */
 #define AIO_RING_PAGES	8
 /*
  * Per-mm table of aio contexts (mm->ioctx_table).  A context's slot index
  * is its ctx->id; unused slots hold NULL.
  */
 struct kioctx_table {
 	struct rcu_head	rcu;	/* used by kfree_rcu() when the table is replaced */
 	unsigned	nr;	/* number of slots in table[] */
 	struct kioctx	*table[];
 };
 /*
  * Per-cpu cache of request slots, moved to/from kioctx->reqs_available in
  * units of kioctx->req_batch.
  */
 struct kioctx_cpu {
 	unsigned		reqs_available;
 };
 /*
  * Kernel-side state of one aio context (one io_setup() call).
  */
 struct kioctx {
 	/* References from lookups; killed by kill_ioctx(), release runs
 	 * free_ioctx_users(). */
 	struct percpu_ref	users;
 	/* Set (once) by kill_ioctx() so a context is only torn down once. */
 	atomic_t		dead;
 	/* One reference per allocated kiocb; release runs free_ioctx_reqs(). */
 	struct percpu_ref	reqs;
 	/* Value userspace uses to name the context; equals mmap_base. */
 	unsigned long		user_id;
 	struct __percpu kioctx_cpu *cpu;
 	/*
 	 * For percpu reqs_available, number of slots we move to/from global
 	 * counter at a time:
 	 */
 	unsigned		req_batch;
 	/*
 	 * This is what userspace passed to io_setup(), it's not used for
 	 * anything but counting against the global max_reqs quota.
 	 *
 	 * The real limit is nr_events - 1, which will be larger (see
 	 * aio_setup_ring())
 	 */
 	unsigned		max_reqs;
 	/* Size of ringbuffer, in units of struct io_event */
 	unsigned		nr_events;
 	/* User address and length of the ring mapping (0 size: nothing to unmap) */
 	unsigned long		mmap_base;
 	unsigned long		mmap_size;
 	/* Ring pages; points at internal_pages or a kcalloc()ed array */
 	struct page		**ring_pages;
 	long			nr_pages;
 	/* Work item that runs free_ioctx() */
 	struct work_struct	free_work;
 	/*
 	 * signals when all in-flight requests are done
 	 */
 	struct completion *requests_done;
 	struct {
 		/*
 		 * This counts the number of available slots in the ringbuffer,
 		 * so we avoid overflowing it: it's decremented (if positive)
 		 * when allocating a kiocb and incremented when the resulting
 		 * io_event is pulled off the ringbuffer.
 		 *
 		 * We batch accesses to it with a percpu version.
 		 */
 		atomic_t	reqs_available;
 	} ____cacheline_aligned_in_smp;
 	struct {
 		spinlock_t	ctx_lock;
 		struct list_head active_reqs;	/* used for cancellation */
 	} ____cacheline_aligned_in_smp;
 	struct {
 		struct mutex	ring_lock;
 		wait_queue_head_t wait;
 	} ____cacheline_aligned_in_smp;
 	struct {
 		/* Kernel's trusted copy of the ring tail */
 		unsigned	tail;
 		/* Completions not yet returned to reqs_available */
 		unsigned	completed_events;
 		/* Protects tail, completed_events and ring writes */
 		spinlock_t	completion_lock;
 	} ____cacheline_aligned_in_smp;
 	struct page		*internal_pages[AIO_RING_PAGES];
 	/* Anonymous file backing the ring pages */
 	struct file		*aio_ring_file;
 	/* Index of this context in mm->ioctx_table */
 	unsigned		id;
 };
 /*------ sysctl variables----*/
 /* aio_nr_lock protects updates of aio_nr */
 static DEFINE_SPINLOCK(aio_nr_lock);
 unsigned long aio_nr;		/* current system wide number of aio requests */
 unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
 /*----end sysctl variables---*/
 /* Slab caches for struct kiocb and struct kioctx, created in aio_setup() */
 static struct kmem_cache	*kiocb_cachep;
 static struct kmem_cache	*kioctx_cachep;
 /* Internal mount that hosts the anonymous ring files */
 static struct vfsmount *aio_mnt;
 /* Forward declarations; both are defined further down */
 static const struct file_operations aio_ring_fops;
 static const struct address_space_operations aio_ctx_aops;
 /* Backing dev info for aio fs.
  * -no dirty page accounting or writeback happens
  * It is attached to every ring inode's mapping by aio_private_file() and
  * initialised with bdi_init() in aio_setup().
  */
 static struct backing_dev_info aio_fs_backing_dev_info = {
 	.name           = "aiofs",
 	.state          = 0,
 	.capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_COPY,
 };
 /*
  * aio_private_file
  *	Creates the anonymous "[aio]" file that backs a context's ring.
  *	The inode's mapping is wired to aio_ctx_aops, points back at @ctx
  *	through private_data and is sized to @nr_pages pages.
  *
  *	Returns the new file opened O_RDWR, or an ERR_PTR on failure.
  */
 static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 {
 	struct qstr name = QSTR_INIT("[aio]", 5);
 	struct address_space *mapping;
 	struct inode *inode;
 	struct file *file;
 	struct path path;

 	inode = alloc_anon_inode(aio_mnt->mnt_sb);
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);

 	mapping = inode->i_mapping;
 	mapping->a_ops = &aio_ctx_aops;
 	mapping->private_data = ctx;
 	mapping->backing_dev_info = &aio_fs_backing_dev_info;
 	inode->i_size = PAGE_SIZE * nr_pages;

 	path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &name);
 	if (!path.dentry) {
 		/* No dentry owns the inode yet, so drop it directly */
 		iput(inode);
 		return ERR_PTR(-ENOMEM);
 	}
 	path.mnt = mntget(aio_mnt);
 	d_instantiate(path.dentry, inode);

 	file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &aio_ring_fops);
 	if (IS_ERR(file))
 		path_put(&path);
 	else
 		file->f_flags = O_RDWR;
 	return file;
 }
 static struct dentry *aio_mount(struct file_system_type *fs_type,
 				int flags, const char *dev_name, void *data)
 {
 	static const struct dentry_operations ops = {
 		.d_dname	= simple_dname,
 	};
 	return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC);
 }
 /* aio_setup
  *	Boot-time initialisation: mounts the internal aio filesystem,
  *	initialises its backing_dev_info and creates the kiocb and kioctx
  *	slab caches.  Failures panic, as this is done early during the
  *	boot sequence.
  */
 static int __init aio_setup(void)
 {
 	static struct file_system_type aio_fs_type = {
 		.name		= "aio",
 		.mount		= aio_mount,
 		.kill_sb	= kill_anon_super,
 	};
 	int bdi_err;

 	aio_mnt = kern_mount(&aio_fs_type);
 	if (IS_ERR(aio_mnt))
 		panic("Failed to create aio fs mount.");

 	bdi_err = bdi_init(&aio_fs_backing_dev_info);
 	if (bdi_err)
 		panic("Failed to init aio fs backing dev info.");

 	kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);

 	pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));
 	return 0;
 }
 __initcall(aio_setup);
 static void put_aio_ring_file(struct kioctx *ctx)
 {
 	struct file *aio_ring_file = ctx->aio_ring_file;
 	if (aio_ring_file) {
 		truncate_setsize(aio_ring_file->f_inode, 0);
 		/* Prevent further access to the kioctx from migratepages */
 		spin_lock(&aio_ring_file->f_inode->i_mapping->private_lock);
 		aio_ring_file->f_inode->i_mapping->private_data = NULL;
 		ctx->aio_ring_file = NULL;
 		spin_unlock(&aio_ring_file->f_inode->i_mapping->private_lock);
 		fput(aio_ring_file);
 	}
 }
 /*
  * aio_free_ring
  *	Releases everything aio_setup_ring() attached to @ctx: the ring
  *	file, the references on the ring pages and, if one was allocated,
  *	the external ring_pages array.  It does not unmap the user mapping.
  */
 static void aio_free_ring(struct kioctx *ctx)
 {
 	int i;

 	/* Disconnect the kioctx from the ring file.  This prevents future
 	 * accesses to the kioctx from page migration.
 	 */
 	put_aio_ring_file(ctx);

 	for (i = 0; i < ctx->nr_pages; i++) {
 		struct page *page = ctx->ring_pages[i];

 		if (!page)
 			continue;

 		/* Only look at the page after the NULL check above */
 		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
 				page_count(page));
 		ctx->ring_pages[i] = NULL;
 		put_page(page);
 	}

 	/* internal_pages is embedded in the kioctx and must not be freed */
 	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
 		kfree(ctx->ring_pages);
 		ctx->ring_pages = NULL;
 	}
 }
 /*
  * aio_ring_mmap
  *	->mmap callback of the ring file: installs the generic file vm
  *	operations and marks the vma VM_DONTEXPAND.  Always returns 0.
  */
 static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	vma->vm_ops = &generic_file_vm_ops;
 	vma->vm_flags |= VM_DONTEXPAND;

 	return 0;
 }
 /*
  * aio_ring_remap
  *	->mremap callback of the ring file.  Finds the context in
  *	vma->vm_mm whose ring file is @file and updates its user_id and
  *	mmap_base to the vma's new start address.
  *
  *	The mm may have no context table at all: the ring is mapped by
  *	aio_setup_ring() before ioctx_add_table() creates the table, so a
  *	mapping of the ring file can exist while mm->ioctx_table is still
  *	NULL.  In that case there is nothing to update.
  */
 static void aio_ring_remap(struct file *file, struct vm_area_struct *vma)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct kioctx_table *table;
 	unsigned i;

 	spin_lock(&mm->ioctx_lock);
 	rcu_read_lock();
 	table = rcu_dereference(mm->ioctx_table);
 	if (!table)
 		goto out_unlock;

 	for (i = 0; i < table->nr; i++) {
 		struct kioctx *ctx;

 		ctx = table->table[i];
 		if (ctx && ctx->aio_ring_file == file) {
 			ctx->user_id = ctx->mmap_base = vma->vm_start;
 			break;
 		}
 	}

 out_unlock:
 	rcu_read_unlock();
 	spin_unlock(&mm->ioctx_lock);
 }
 /* File operations of the anonymous ring file created by aio_private_file() */
 static const struct file_operations aio_ring_fops = {
 	.mmap = aio_ring_mmap,
 	.mremap = aio_ring_remap,
 };
 #if IS_ENABLED(CONFIG_MIGRATION)
 /*
  * aio_migratepage
  *	->migratepage callback for ring pages: moves ring page @old to @new
  *	and updates ctx->ring_pages[] to point at the new page.
  *
  *	Returns the result of migrate_page_move_mapping() (which is
  *	MIGRATEPAGE_SUCCESS when the page was moved), -EINVAL if the mapping
  *	no longer has a kioctx or @old's index lies beyond the ring, or
  *	-EAGAIN if ring_lock is contended or the ring slot no longer holds
  *	@old.
  */
 static int aio_migratepage(struct address_space *mapping, struct page *new,
 			struct page *old, enum migrate_mode mode)
 {
 	struct kioctx *ctx;
 	unsigned long flags;
 	pgoff_t idx;
 	int rc;
 	rc = 0;
 	/* mapping->private_lock here protects against the kioctx teardown.  */
 	spin_lock(&mapping->private_lock);
 	ctx = mapping->private_data;
 	if (!ctx) {
 		/* put_aio_ring_file() already detached the kioctx */
 		rc = -EINVAL;
 		goto out;
 	}
 	/* Take the ring_lock mutex.  This prevents aio_read_events() from
 	 * writing to the ring's head, and prevents page migration from
 	 * mucking in a partially initialized kioctx.  Only trylock is used
 	 * because a spinlock is held here.
 	 */
 	if (!mutex_trylock(&ctx->ring_lock)) {
 		rc = -EAGAIN;
 		goto out;
 	}
 	idx = old->index;
 	if (idx < (pgoff_t)ctx->nr_pages) {
 		/* Make sure the old page hasn't already been changed */
 		if (ctx->ring_pages[idx] != old)
 			rc = -EAGAIN;
 	} else
 		rc = -EINVAL;
 	if (rc != 0)
 		goto out_unlock;
 	/* Writeback must be complete */
 	BUG_ON(PageWriteback(old));
 	/* Reference for the ring_pages[] slot; dropped again on failure */
 	get_page(new);
 	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
 	if (rc != MIGRATEPAGE_SUCCESS) {
 		put_page(new);
 		goto out_unlock;
 	}
 	/* Take completion_lock to prevent other writes to the ring buffer
 	 * while the old page is copied to the new.  This prevents new
 	 * events from being lost.
 	 */
 	spin_lock_irqsave(&ctx->completion_lock, flags);
 	migrate_page_copy(new, old);
 	BUG_ON(ctx->ring_pages[idx] != old);
 	ctx->ring_pages[idx] = new;
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 	/* The old page is no longer accessible. */
 	put_page(old);
 out_unlock:
 	mutex_unlock(&ctx->ring_lock);
 out:
 	spin_unlock(&mapping->private_lock);
 	return rc;
 }
 #endif
 /*
  * Address space operations installed on every ring inode's mapping by
  * aio_private_file().  Page migration support is only built when
  * CONFIG_MIGRATION is enabled.
  */
 static const struct address_space_operations aio_ctx_aops = {
 	.set_page_dirty = __set_page_dirty_no_writeback,
 #if IS_ENABLED(CONFIG_MIGRATION)
 	.migratepage	= aio_migratepage,
 #endif
 };
 /*
  * aio_setup_ring
  *	Builds the completion ring for @ctx: creates the backing file,
  *	allocates the ring pages, maps them into the current process and
  *	initialises the aio_ring header in the first page.
  *
  *	On success sets ctx->aio_ring_file, ring_pages, nr_pages, mmap_base,
  *	mmap_size, user_id and nr_events, and returns 0.  Returns -EINVAL if
  *	the page count does not fit an int, otherwise -ENOMEM on any failure
  *	(the underlying error codes are not propagated).
  */
 static int aio_setup_ring(struct kioctx *ctx)
 {
 	struct aio_ring *ring;
 	unsigned nr_events = ctx->max_reqs;
 	struct mm_struct *mm = current->mm;
 	unsigned long size, unused;
 	int nr_pages;
 	int i;
 	struct file *file;
 	/* Compensate for the ring buffer's head/tail overlap entry */
 	nr_events += 2;	/* 1 is required, 2 for good luck */
 	size = sizeof(struct aio_ring);
 	size += sizeof(struct io_event) * nr_events;
 	nr_pages = PFN_UP(size);
 	if (nr_pages < 0)
 		return -EINVAL;
 	file = aio_private_file(ctx, nr_pages);
 	if (IS_ERR(file)) {
 		ctx->aio_ring_file = NULL;
 		return -ENOMEM;
 	}
 	ctx->aio_ring_file = file;
 	/* Use every slot that fits in the whole pages just sized above */
 	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
 			/ sizeof(struct io_event);
 	/* Small rings use the array embedded in the kioctx */
 	ctx->ring_pages = ctx->internal_pages;
 	if (nr_pages > AIO_RING_PAGES) {
 		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
 					  GFP_KERNEL);
 		if (!ctx->ring_pages) {
 			put_aio_ring_file(ctx);
 			return -ENOMEM;
 		}
 	}
 	for (i = 0; i < nr_pages; i++) {
 		struct page *page;
 		page = find_or_create_page(file->f_inode->i_mapping,
 					   i, GFP_HIGHUSER | __GFP_ZERO);
 		if (!page)
 			break;
 		pr_debug("pid(%d) page[%d]->count=%d\n",
 			 current->pid, i, page_count(page));
 		SetPageUptodate(page);
 		unlock_page(page);
 		ctx->ring_pages[i] = page;
 	}
 	/* Record how many pages were obtained so aio_free_ring() can drop them */
 	ctx->nr_pages = i;
 	if (unlikely(i != nr_pages)) {
 		aio_free_ring(ctx);
 		return -ENOMEM;
 	}
 	ctx->mmap_size = nr_pages * PAGE_SIZE;
 	pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
 	down_write(&mm->mmap_sem);
 	ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
 				       PROT_READ | PROT_WRITE,
 				       MAP_SHARED, 0, &unused);
 	up_write(&mm->mmap_sem);
 	if (IS_ERR((void *)ctx->mmap_base)) {
 		/* A zero mmap_size tells kill_ioctx() there is nothing to unmap */
 		ctx->mmap_size = 0;
 		aio_free_ring(ctx);
 		return -ENOMEM;
 	}
 	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
 	/* The mapping address doubles as the context id seen by userspace */
 	ctx->user_id = ctx->mmap_base;
 	ctx->nr_events = nr_events; /* trusted copy */
 	ring = kmap_atomic(ctx->ring_pages[0]);
 	ring->nr = nr_events;	/* user copy */
 	/* Placeholder; ioctx_add_table() stores the real table index */
 	ring->id = ~0U;
 	ring->head = ring->tail = 0;
 	ring->magic = AIO_RING_MAGIC;
 	ring->compat_features = AIO_RING_COMPAT_FEATURES;
 	ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
 	ring->header_length = sizeof(struct aio_ring);
 	kunmap_atomic(ring);
 	flush_dcache_page(ctx->ring_pages[0]);
 	return 0;
 }
 /*
  * Ring geometry helpers.  The first page holds the aio_ring header, so it
  * has room for fewer events than the others.  AIO_EVENTS_OFFSET is that
  * difference: aio_complete() adds it to a ring index so the result can be
  * split into a page number (/ AIO_EVENTS_PER_PAGE) and a slot within the
  * page (% AIO_EVENTS_PER_PAGE).
  */
 #define AIO_EVENTS_PER_PAGE	(PAGE_SIZE / sizeof(struct io_event))
 #define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
 #define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
 /*
  * kiocb_set_cancel_fn
  *	Registers @cancel as the cancellation callback of @req and, if the
  *	request is not linked yet (ki_list.next is NULL), adds it to its
  *	context's active_reqs list.  Both steps run under ctx->ctx_lock
  *	with interrupts disabled.
  */
 void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
 {
 	struct kioctx *ioctx = req->ki_ctx;
 	unsigned long irqflags;

 	spin_lock_irqsave(&ioctx->ctx_lock, irqflags);

 	if (req->ki_list.next == NULL)
 		list_add(&req->ki_list, &ioctx->active_reqs);
 	req->ki_cancel = cancel;

 	spin_unlock_irqrestore(&ioctx->ctx_lock, irqflags);
 }
 EXPORT_SYMBOL(kiocb_set_cancel_fn);
 /*
  * kiocb_cancel
  *	Atomically claims @kiocb's cancel callback and invokes it.
  *
  *	Returns -EINVAL if the request has no cancel callback or has
  *	already been claimed for cancellation; otherwise returns whatever
  *	the callback returns.
  */
 static int kiocb_cancel(struct kiocb *kiocb)
 {
 	kiocb_cancel_fn *seen, *cancel;

 	/*
 	 * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it
 	 * actually has a cancel function, hence the cmpxchg()
 	 */
 	cancel = ACCESS_ONCE(kiocb->ki_cancel);
 	for (;;) {
 		if (!cancel || cancel == KIOCB_CANCELLED)
 			return -EINVAL;

 		seen = cmpxchg(&kiocb->ki_cancel, cancel, KIOCB_CANCELLED);
 		if (seen == cancel)
 			break;

 		/* Someone changed ki_cancel under us; retry with the new value */
 		cancel = seen;
 	}

 	return cancel(kiocb);
 }
 /*
  * free_ioctx
  *	Work function (queued by free_ioctx_reqs()) performing the final
  *	teardown of a kioctx: ring, percpu data, both percpu refs and the
  *	kioctx object itself.
  */
 static void free_ioctx(struct work_struct *work)
 {
 	struct kioctx *ioctx;

 	ioctx = container_of(work, struct kioctx, free_work);
 	pr_debug("freeing %p\n", ioctx);

 	aio_free_ring(ioctx);
 	free_percpu(ioctx->cpu);
 	percpu_ref_exit(&ioctx->reqs);
 	percpu_ref_exit(&ioctx->users);
 	kmem_cache_free(kioctx_cachep, ioctx);
 }
 /*
  * free_ioctx_reqs
  *	Release callback of ctx->reqs.  Signals the waiter registered in
  *	ctx->requests_done, if any, and schedules free_ioctx() to free the
  *	context from a workqueue.
  */
 static void free_ioctx_reqs(struct percpu_ref *ref)
 {
 	struct kioctx *ioctx = container_of(ref, struct kioctx, reqs);
 	struct completion *done = ioctx->requests_done;

 	/* At this point we know that there are no in-flight requests left */
 	if (done)
 		complete(done);

 	INIT_WORK(&ioctx->free_work, free_ioctx);
 	schedule_work(&ioctx->free_work);
 }
 /*
  * free_ioctx_users
  *	Release callback of ctx->users.  By the time it runs the kioctx has
  *	been removed from the "hash table" and ctx->users has dropped to 0,
  *	so no more kiocbs can be submitted and it is safe to cancel the ones
  *	still on active_reqs.  Afterwards ctx->reqs is killed and the
  *	reference taken on it in ioctx_alloc() is dropped.
  */
 static void free_ioctx_users(struct percpu_ref *ref)
 {
 	struct kioctx *ioctx = container_of(ref, struct kioctx, users);

 	spin_lock_irq(&ioctx->ctx_lock);
 	for (;;) {
 		struct kiocb *victim;

 		if (list_empty(&ioctx->active_reqs))
 			break;

 		victim = list_first_entry(&ioctx->active_reqs,
 					  struct kiocb, ki_list);
 		/* Unlink first so the request is visited exactly once */
 		list_del_init(&victim->ki_list);
 		kiocb_cancel(victim);
 	}
 	spin_unlock_irq(&ioctx->ctx_lock);

 	percpu_ref_kill(&ioctx->reqs);
 	percpu_ref_put(&ioctx->reqs);
 }
 /*
  * ioctx_add_table
  *	Installs @ctx in a free slot of @mm's context table, growing the
  *	table when it is full or missing.  The chosen index is stored in
  *	ctx->id and in the ring header's id field.
  *
  *	Returns 0 on success or -ENOMEM if a larger table cannot be
  *	allocated.
  */
 static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 {
 	unsigned i, new_nr;
 	struct kioctx_table *table, *old;
 	struct aio_ring *ring;
 	spin_lock(&mm->ioctx_lock);
 	table = rcu_dereference_raw(mm->ioctx_table);
 	while (1) {
 		/* Look for a free slot in the current table (lock held) */
 		if (table)
 			for (i = 0; i < table->nr; i++)
 				if (!table->table[i]) {
 					ctx->id = i;
 					table->table[i] = ctx;
 					spin_unlock(&mm->ioctx_lock);
 					/* While kioctx setup is in progress,
 					 * we are protected from page migration
 					 * changes ring_pages by ->ring_lock.
 					 */
 					ring = kmap_atomic(ctx->ring_pages[0]);
 					ring->id = ctx->id;
 					kunmap_atomic(ring);
 					return 0;
 				}
 		/* No free slot: allocate a table four times as large (or of
 		 * four entries if there was none).  The lock is dropped for
 		 * the GFP_KERNEL allocation. */
 		new_nr = (table ? table->nr : 1) * 4;
 		spin_unlock(&mm->ioctx_lock);
 		table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
 				new_nr, GFP_KERNEL);
 		if (!table)
 			return -ENOMEM;
 		table->nr = new_nr;
 		spin_lock(&mm->ioctx_lock);
 		/* The table may have changed while the lock was dropped */
 		old = rcu_dereference_raw(mm->ioctx_table);
 		if (!old) {
 			rcu_assign_pointer(mm->ioctx_table, table);
 		} else if (table->nr > old->nr) {
 			memcpy(table->table, old->table,
 			       old->nr * sizeof(struct kioctx *));
 			rcu_assign_pointer(mm->ioctx_table, table);
 			kfree_rcu(old, rcu);
 		} else {
 			/* The installed table is already at least as large as
 			 * ours; discard the new one and rescan the installed
 			 * table. */
 			kfree(table);
 			table = old;
 		}
 	}
 }
 /*
  * aio_nr_sub
  *	Subtracts @nr from the system wide aio_nr counter under
  *	aio_nr_lock.  If the subtraction would wrap below zero a warning is
  *	emitted and the counter is reset to 0.
  */
 static void aio_nr_sub(unsigned nr)
 {
 	unsigned long remaining;

 	spin_lock(&aio_nr_lock);
 	remaining = aio_nr - nr;
 	/* A result larger than the old value means the subtraction wrapped */
 	if (WARN_ON(remaining > aio_nr))
 		remaining = 0;
 	aio_nr = remaining;
 	spin_unlock(&aio_nr_lock);
 }
 /* ioctx_alloc
  *	Allocates and initializes an ioctx for the current process and adds
  *	it to current->mm's context table.
  *
  *	@nr_events is the number of events requested by userspace; it is
  *	raised to at least num_possible_cpus() * 4 and then doubled (see
  *	below).
  *
  *	Returns the new context, holding one ctx->users reference for the
  *	caller, or an ERR_PTR: -EINVAL if @nr_events is too large, -EAGAIN
  *	if the system wide aio limit would be exceeded, -ENOMEM or the error
  *	from aio_setup_ring()/ioctx_add_table() otherwise.
  */
 static struct kioctx *ioctx_alloc(unsigned nr_events)
 {
 	struct mm_struct *mm = current->mm;
 	struct kioctx *ctx;
 	int err = -ENOMEM;

 	/*
 	 * We keep track of the number of available ringbuffer slots, to prevent
 	 * overflow (reqs_available), and we also use percpu counters for this.
 	 *
 	 * So since up to half the slots might be on other cpu's percpu counters
 	 * and unavailable, double nr_events so userspace sees what they
 	 * expected: additionally, we move req_batch slots to/from percpu
 	 * counters at a time, so make sure that isn't 0:
 	 */
 	nr_events = max(nr_events, num_possible_cpus() * 4);

 	/*
 	 * Prevent overflows.  The limits are checked against the undoubled
 	 * value (2 * n > limit is the same as n > limit / 2), so that the
 	 * doubling below cannot wrap around and sneak past the check.
 	 */
 	if ((nr_events > (0x10000000U / sizeof(struct io_event)) / 2) ||
 	    (nr_events > (0x10000000U / sizeof(struct kiocb)) / 2)) {
 		pr_debug("EINVAL: nr_events too high\n");
 		return ERR_PTR(-EINVAL);
 	}
 	nr_events *= 2;

 	if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL))
 		return ERR_PTR(-EAGAIN);

 	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
 	if (!ctx)
 		return ERR_PTR(-ENOMEM);

 	ctx->max_reqs = nr_events;

 	spin_lock_init(&ctx->ctx_lock);
 	spin_lock_init(&ctx->completion_lock);
 	mutex_init(&ctx->ring_lock);
 	/* Protect against page migration throughout kioctx setup by keeping
 	 * the ring_lock mutex held until setup is complete. */
 	mutex_lock(&ctx->ring_lock);
 	init_waitqueue_head(&ctx->wait);

 	INIT_LIST_HEAD(&ctx->active_reqs);

 	if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL))
 		goto err;

 	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL))
 		goto err;

 	ctx->cpu = alloc_percpu(struct kioctx_cpu);
 	if (!ctx->cpu)
 		goto err;

 	err = aio_setup_ring(ctx);
 	if (err < 0)
 		goto err;

 	/* One ring slot always stays unused, hence nr_events - 1 */
 	atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
 	ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
 	if (ctx->req_batch < 1)
 		ctx->req_batch = 1;

 	/* limit the number of system wide aios */
 	spin_lock(&aio_nr_lock);
 	if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
 	    aio_nr + nr_events < aio_nr) {
 		spin_unlock(&aio_nr_lock);
 		err = -EAGAIN;
 		goto err_ctx;
 	}
 	aio_nr += ctx->max_reqs;
 	spin_unlock(&aio_nr_lock);

 	percpu_ref_get(&ctx->users);	/* io_setup() will drop this ref */
 	percpu_ref_get(&ctx->reqs);	/* free_ioctx_users() will drop this */

 	err = ioctx_add_table(ctx, mm);
 	if (err)
 		goto err_cleanup;

 	/* Release the ring_lock mutex now that all setup is complete. */
 	mutex_unlock(&ctx->ring_lock);

 	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
 		 ctx, ctx->user_id, mm, ctx->nr_events);
 	return ctx;

 err_cleanup:
 	aio_nr_sub(ctx->max_reqs);
 err_ctx:
 	/*
 	 * aio_setup_ring() succeeded, so the ring is mapped into this
 	 * process.  aio_free_ring() does not unmap it; do that here or the
 	 * mapping (and its reference on the ring file) is leaked.
 	 */
 	if (ctx->mmap_size)
 		vm_munmap(ctx->mmap_base, ctx->mmap_size);
 	aio_free_ring(ctx);
 err:
 	mutex_unlock(&ctx->ring_lock);
 	free_percpu(ctx->cpu);
 	percpu_ref_exit(&ctx->reqs);
 	percpu_ref_exit(&ctx->users);
 	kmem_cache_free(kioctx_cachep, ctx);
 	pr_debug("error allocating ioctx %d\n", err);
 	return ERR_PTR(err);
 }
 /* kill_ioctx
  *	Cancels all outstanding aio requests on an aio context.  Used
  *	when the processes owning a context have all exited to encourage
  *	the rapid destruction of the kioctx.
  *
  *	@mm:		mm whose context table holds @ctx
  *	@ctx:		context to tear down
  *	@requests_done:	stored in the context; free_ioctx_reqs() completes it
  *			(if non-NULL) once no requests remain in flight
  *
  *	Returns 0, or -EINVAL if the context was already marked dead.
  */
 static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
 		struct completion *requests_done)
 {
 	struct kioctx_table *table;
 	/* Only the first caller to mark the context dead tears it down */
 	if (atomic_xchg(&ctx->dead, 1))
 		return -EINVAL;
 	/* Remove the context from the table so it can no longer be looked up */
 	spin_lock(&mm->ioctx_lock);
 	table = rcu_dereference_raw(mm->ioctx_table);
 	WARN_ON(ctx != table->table[ctx->id]);
 	table->table[ctx->id] = NULL;
 	spin_unlock(&mm->ioctx_lock);
 	/* percpu_ref_kill() will do the necessary call_rcu() */
 	wake_up_all(&ctx->wait);
 	/*
 	 * It'd be more correct to do this in free_ioctx(), after all
 	 * the outstanding kiocbs have finished - but by then io_destroy
 	 * has already returned, so io_setup() could potentially return
 	 * -EAGAIN with no ioctxs actually in use (as far as userspace
 	 *  could tell).
 	 */
 	aio_nr_sub(ctx->max_reqs);
 	/* exit_aio() zeroes mmap_size to skip the unmap */
 	if (ctx->mmap_size)
 		vm_munmap(ctx->mmap_base, ctx->mmap_size);
 	/* Must be set before the kill: free_ioctx_reqs() reads it */
 	ctx->requests_done = requests_done;
 	percpu_ref_kill(&ctx->users);
 	return 0;
 }
 /* wait_on_sync_kiocb:
  *	Waits on the given sync kiocb to complete.
  *
  *	Sleeps uninterruptibly until req->ki_ctx becomes non-NULL, which
  *	aio_complete() does for sync kiocbs after storing the result in
  *	ki_user_data.  Returns that result.
  */
 ssize_t wait_on_sync_kiocb(struct kiocb *req)
 {
 	while (!req->ki_ctx) {
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		/* Re-check after changing state so a wakeup is not missed */
 		if (req->ki_ctx)
 			break;
 		io_schedule();
 	}
 	__set_current_state(TASK_RUNNING);
 	return req->ki_user_data;
 }
 EXPORT_SYMBOL(wait_on_sync_kiocb);
 /*
  * exit_aio: called when the last user of mm goes away.  At this point, there is
  * no way for any new requests to be submitted or any of the io_* syscalls to be
  * called on the context.
  *
  * There may be outstanding kiocbs; each context is killed in turn and this
  * function waits (via the requests_done completion) until all of its
  * requests have finished before moving on.  Finally the context table itself
  * is freed.
  */
 void exit_aio(struct mm_struct *mm)
 {
 	struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
 	int i;
 	/* No table means this mm never had an aio context */
 	if (!table)
 		return;
 	for (i = 0; i < table->nr; ++i) {
 		struct kioctx *ctx = table->table[i];
 		struct completion requests_done =
 			COMPLETION_INITIALIZER_ONSTACK(requests_done);
 		if (!ctx)
 			continue;
 		/*
 		 * We don't need to bother with munmap() here - exit_mmap(mm)
 		 * is coming and it'll unmap everything. And we simply can't,
 		 * this is not necessarily our ->mm.
 		 * Since kill_ioctx() uses non-zero ->mmap_size as indicator
 		 * that it needs to unmap the area, just set it to 0.
 		 */
 		ctx->mmap_size = 0;
 		kill_ioctx(mm, ctx, &requests_done);
 		/* Wait until all IO for the context are done. */
 		wait_for_completion(&requests_done);
 	}
 	RCU_INIT_POINTER(mm->ioctx_table, NULL);
 	kfree(table);
 }
 /*
  * put_reqs_available
  *	Returns @nr request slots to this cpu's cache.  Whenever the cache
  *	holds two batches or more, whole batches are moved back to the
  *	context wide ctx->reqs_available counter.  Runs with local
  *	interrupts disabled.
  */
 static void put_reqs_available(struct kioctx *ctx, unsigned nr)
 {
 	struct kioctx_cpu *pcpu;
 	unsigned long irqflags;

 	local_irq_save(irqflags);

 	pcpu = this_cpu_ptr(ctx->cpu);
 	pcpu->reqs_available += nr;
 	while (pcpu->reqs_available >= ctx->req_batch * 2) {
 		pcpu->reqs_available -= ctx->req_batch;
 		atomic_add(ctx->req_batch, &ctx->reqs_available);
 	}

 	local_irq_restore(irqflags);
 }
 /*
  * get_reqs_available
  *	Takes one request slot from this cpu's cache, refilling the cache
  *	with a batch from ctx->reqs_available first if it is empty.
  *
  *	Returns true if a slot was taken, false if the cache is empty and
  *	fewer than req_batch slots are left in the context wide counter.
  */
 static bool get_reqs_available(struct kioctx *ctx)
 {
 	struct kioctx_cpu *kcpu;
 	bool ret = false;
 	unsigned long flags;
 	local_irq_save(flags);
 	kcpu = this_cpu_ptr(ctx->cpu);
 	if (!kcpu->reqs_available) {
 		int old, avail = atomic_read(&ctx->reqs_available);
 		/* Claim a whole batch with cmpxchg, retrying if we raced */
 		do {
 			if (avail < ctx->req_batch)
 				goto out;
 			old = avail;
 			avail = atomic_cmpxchg(&ctx->reqs_available,
 					       avail, avail - ctx->req_batch);
 		} while (avail != old);
 		kcpu->reqs_available += ctx->req_batch;
 	}
 	ret = true;
 	kcpu->reqs_available--;
 out:
 	local_irq_restore(flags);
 	return ret;
 }
 /* refill_reqs_available
  *	Updates the reqs_available reference counts used for tracking the
  *	number of free slots in the completion ring.  This can be called
  *	from aio_complete() (to optimistically update reqs_available) or
  *	from aio_get_req() (the "we're out of events" case).  It must be
  *	called holding ctx->completion_lock.
  *
  *	@head is the ring head as read from the user-writable ring header,
  *	@tail the kernel's tail.  Completed events that are no longer in the
  *	ring are handed back through put_reqs_available().
  */
 static void refill_reqs_available(struct kioctx *ctx, unsigned head,
 				  unsigned tail)
 {
 	unsigned in_ring, reclaim;

 	/* Clamp head since userland can write to it. */
 	head %= ctx->nr_events;
 	in_ring = (head <= tail) ? tail - head
 				 : ctx->nr_events - (head - tail);

 	/* Events still sitting in the ring cannot be reclaimed yet */
 	if (ctx->completed_events <= in_ring)
 		return;

 	reclaim = ctx->completed_events - in_ring;
 	ctx->completed_events -= reclaim;
 	put_reqs_available(ctx, reclaim);
 }
 /* user_refill_reqs_available
  *	Called to refill reqs_available when aio_get_req() finds the
  *	completion ring out of space.
  */
 static void user_refill_reqs_available(struct kioctx *ctx)
 {
 	spin_lock_irq(&ctx->completion_lock);

 	if (ctx->completed_events != 0) {
 		struct aio_ring *hdr;
 		unsigned user_head;

 		/* Reading ring->head may race with aio_read_events_ring()
 		 * here, but that's okay: whether we see the old or the new
 		 * value, either is valid.  The important part is that head
 		 * cannot pass tail, since holding ctx->completion_lock keeps
 		 * aio_complete() from updating tail.  Even if head is
 		 * invalid, the check against ctx->completed_events in
 		 * refill_reqs_available() makes sure we do the safe/right
 		 * thing.
 		 */
 		hdr = kmap_atomic(ctx->ring_pages[0]);
 		user_head = hdr->head;
 		kunmap_atomic(hdr);

 		refill_reqs_available(ctx, user_head, ctx->tail);
 	}

 	spin_unlock_irq(&ctx->completion_lock);
 }
 /* aio_get_req
  *	Allocate a slot for an aio request.
  *	Reserves a ring slot (retrying once after reclaiming completed
  *	events), allocates a zeroed kiocb, takes a ctx->reqs reference for
  *	it and binds it to @ctx.
  *	Returns NULL if no requests are free or the allocation fails.
  */
 static inline struct kiocb *aio_get_req(struct kioctx *ctx)
 {
 	struct kiocb *req;

 	if (!get_reqs_available(ctx)) {
 		user_refill_reqs_available(ctx);
 		if (!get_reqs_available(ctx))
 			return NULL;
 	}

 	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
 	if (unlikely(!req)) {
 		/* Give back the slot reserved above */
 		put_reqs_available(ctx, 1);
 		return NULL;
 	}

 	percpu_ref_get(&ctx->reqs);
 	req->ki_ctx = ctx;
 	return req;
 }
 /*
  * kiocb_free
  *	Drops the file and eventfd references held by @req, if any, and
  *	returns the kiocb to its slab cache.
  */
 static void kiocb_free(struct kiocb *req)
 {
 	if (req->ki_filp != NULL)
 		fput(req->ki_filp);

 	if (req->ki_eventfd)
 		eventfd_ctx_put(req->ki_eventfd);

 	kmem_cache_free(kiocb_cachep, req);
 }
 /*
  * lookup_ioctx
  *	Translates the user supplied @ctx_id (the user address of the ring)
  *	into its kioctx.  The table index is read from the id field of the
  *	user visible ring header and then validated: it must lie within the
  *	current mm's table and the context found there must have a user_id
  *	equal to @ctx_id.
  *
  *	Returns the context with a ctx->users reference held, or NULL if
  *	the ring header cannot be read or no matching context exists.
  */
 static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 {
 	struct aio_ring __user *ring  = (void __user *)ctx_id;
 	struct mm_struct *mm = current->mm;
 	struct kioctx *found = NULL;
 	struct kioctx_table *table;
 	unsigned id;

 	if (get_user(id, &ring->id))
 		return NULL;

 	rcu_read_lock();
 	table = rcu_dereference(mm->ioctx_table);
 	if (table && id < table->nr) {
 		struct kioctx *ctx = table->table[id];

 		if (ctx && ctx->user_id == ctx_id) {
 			percpu_ref_get(&ctx->users);
 			found = ctx;
 		}
 	}
 	rcu_read_unlock();

 	return found;
 }
 /* aio_complete
  *	Called when the io request on the given iocb is complete.
  *
  *	@iocb: the request that finished
  *	@res:  primary result, stored in the event's ->res
  *	@res2: secondary result, stored in the event's ->res2
  *
  *	For a sync iocb the result is stashed in the iocb itself and the
  *	task recorded in ->ki_obj.tsk is woken.  Otherwise an io_event is
  *	appended to the context's ring, the optional eventfd is signalled,
  *	the iocb is freed, waiters on ctx->wait are woken and the ctx->reqs
  *	reference is dropped.
  */
 void aio_complete(struct kiocb *iocb, long res, long res2)
 {
 	struct kioctx	*ctx = iocb->ki_ctx;
 	struct aio_ring	*ring;
 	struct io_event	*ev_page, *event;
 	unsigned tail, pos, head;
 	unsigned long	flags;
 	/*
 	 * Special case handling for sync iocbs:
 	 *  - events go directly into the iocb for fast handling
 	 *  - the sync task with the iocb in its stack holds the single iocb
 	 *    ref, no other paths have a way to get another ref
 	 *  - the sync task helpfully left a reference to itself in the iocb
 	 */
 	if (is_sync_kiocb(iocb)) {
 		iocb->ki_user_data = res;
 		/* publish the result before flagging the iocb as done */
 		smp_wmb();
 		iocb->ki_ctx = ERR_PTR(-EXDEV);
 		wake_up_process(iocb->ki_obj.tsk);
 		return;
 	}
 	/*
 	 * A non-NULL ->next means the iocb is linked on a list; unlink it
 	 * under ctx->ctx_lock.  The function-scope 'flags' is reused here:
 	 * this critical section ends before completion_lock is taken, so
 	 * no separate (shadowing) variable is needed.
 	 */
 	if (iocb->ki_list.next) {
 		spin_lock_irqsave(&ctx->ctx_lock, flags);
 		list_del(&iocb->ki_list);
 		spin_unlock_irqrestore(&ctx->ctx_lock, flags);
 	}
 	/*
 	 * Add a completion event to the ring buffer. Must be done holding
 	 * ctx->completion_lock to prevent other code from messing with the tail
 	 * pointer since we might be called from irq context.
 	 */
 	spin_lock_irqsave(&ctx->completion_lock, flags);
 	tail = ctx->tail;
 	/* slot to fill: current tail, offset past the ring header */
 	pos = tail + AIO_EVENTS_OFFSET;
 	/* advance tail, wrapping at nr_events */
 	if (++tail >= ctx->nr_events)
 		tail = 0;
 	ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
 	event = ev_page + pos % AIO_EVENTS_PER_PAGE;
 	event->obj = (u64)(unsigned long)iocb->ki_obj.user;
 	event->data = iocb->ki_user_data;
 	event->res = res;
 	event->res2 = res2;
 	kunmap_atomic(ev_page);
 	flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
 	pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",
 		 ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
 		 res, res2);
 	/* after flagging the request as done, we
 	 * must never even look at it again
 	 */
 	smp_wmb();	/* make event visible before updating tail */
 	ctx->tail = tail;
 	/* mirror the new tail into the ring header and sample head */
 	ring = kmap_atomic(ctx->ring_pages[0]);
 	head = ring->head;
 	ring->tail = tail;
 	kunmap_atomic(ring);
 	flush_dcache_page(ctx->ring_pages[0]);
 	ctx->completed_events++;
 	if (ctx->completed_events > 1)
 		refill_reqs_available(ctx, head, tail);
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 	pr_debug("added to ring %p at [%u]\n", iocb, tail);
 	/*
 	 * Check if the user asked us to deliver the result through an
 	 * eventfd. The eventfd_signal() function is safe to be called
 	 * from IRQ context.
 	 */
 	if (iocb->ki_eventfd != NULL)
 		eventfd_signal(iocb->ki_eventfd, 1);
 	/* everything turned out well, dispose of the aiocb. */
 	kiocb_free(iocb);
 	/*
 	 * We have to order our ring_info tail store above and test
 	 * of the wait list below outside the wait lock.  This is
 	 * like in wake_up_bit() where clearing a bit has to be
 	 * ordered with the unlocked test.
 	 */
 	smp_mb();
 	if (waitqueue_active(&ctx->wait))
 		wake_up(&ctx->wait);
 	/* 'ctx' was cached up front; iocb must not be touched any more */
 	percpu_ref_put(&ctx->reqs);
 }
 EXPORT_SYMBOL(aio_complete);
 /* aio_read_events_ring
  *	Pull events off of the ioctx's event ring and copy them to the
  *	user buffer @event, at most @nr of them.  Returns the number of
  *	events fetched (0 if the ring is empty), or -EFAULT if copying to
  *	user space fails; in the -EFAULT case the ring head is left
  *	unmodified.  Runs under ctx->ring_lock.
  */
 static long aio_read_events_ring(struct kioctx *ctx,
 				 struct io_event __user *event, long nr)
 {
 	struct aio_ring *ring;
 	unsigned head, tail, pos;
 	long ret = 0;
 	int copy_ret;
+	/*
+	 * The mutex can block and wake us up and that will cause
+	 * wait_event_interruptible_hrtimeout() to schedule without sleeping
+	 * and repeat. This should be rare enough that it doesn't cause
+	 * performance issues. See the comment in read_events() for more detail.
+	 */
+	sched_annotate_sleep();
 	mutex_lock(&ctx->ring_lock);
 	/* Access to ->ring_pages here is protected by ctx->ring_lock. */
 	ring = kmap_atomic(ctx->ring_pages[0]);
 	head = ring->head;
 	tail = ring->tail;
 	kunmap_atomic(ring);
 	/*
 	 * Ensure that once we've read the current tail pointer, that
 	 * we also see the events that were stored up to the tail.
 	 */
 	smp_rmb();
 	pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);
 	if (head == tail)
 		goto out;
 	/* Clamp both indices into the ring before using them as offsets. */
 	head %= ctx->nr_events;
 	tail %= ctx->nr_events;
 	while (ret < nr) {
 		long avail;
 		struct io_event *ev;
 		struct page *page;
 		/* Contiguous run: up to tail, or up to the end of the ring. */
 		avail = (head <= tail ?  tail : ctx->nr_events) - head;
 		if (head == tail)
 			break;
 		/* Limit to what the caller still wants ... */
 		avail = min(avail, nr - ret);
 		/* ... and to what remains in the current ring page. */
 		avail = min_t(long, avail, AIO_EVENTS_PER_PAGE -
 			    ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE));
 		pos = head + AIO_EVENTS_OFFSET;
 		page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
 		pos %= AIO_EVENTS_PER_PAGE;
 		ev = kmap(page);
 		copy_ret = copy_to_user(event + ret, ev + pos,
 					sizeof(*ev) * avail);
 		kunmap(page);
 		if (unlikely(copy_ret)) {
 			ret = -EFAULT;
 			goto out;
 		}
 		ret += avail;
 		head += avail;
 		head %= ctx->nr_events;
 	}
 	/* Publish the new head in the ring header. */
 	ring = kmap_atomic(ctx->ring_pages[0]);
 	ring->head = head;
 	kunmap_atomic(ring);
 	flush_dcache_page(ctx->ring_pages[0]);
 	pr_debug("%li  h%u t%u\n", ret, head, tail);
 out:
 	mutex_unlock(&ctx->ring_lock);
 	return ret;
 }
 /* aio_read_events
  *	One pass of event collection, usable as a wait condition.
  *
  *	Reads into @event starting at offset *@i and accumulates the number
  *	of events fetched in *@i.  If the context is marked dead the pass
  *	is treated as failing with -EINVAL.  While *@i is still zero it is
  *	overwritten with the pass result, so an error code can be reported
  *	through it.
  *
  *	Returns true when the caller should stop waiting: either an error
  *	occurred or at least @min_nr events have been gathered.
  */
 static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
 			    struct io_event __user *event, long *i)
 {
 	long got = aio_read_events_ring(ctx, event + *i, nr - *i);
 	if (got > 0)
 		*i += got;
 	if (unlikely(atomic_read(&ctx->dead)))
 		got = -EINVAL;
 	if (*i == 0)
 		*i = got;
 	if (got < 0)
 		return true;
 	return *i >= min_nr;
 }
 /* read_events
  *	Gather between @min_nr and @nr events from @ctx into @event.
  *
  *	A NULL @timeout leaves the deadline at KTIME_MAX.  A timeout that
  *	converts to zero results in a single, non-waiting collection pass.
  *	Anything else waits on ctx->wait until aio_read_events() reports
  *	completion, the timeout expires or a signal arrives.
  *
  *	Returns the value accumulated by aio_read_events() (event count or
  *	negative errno), -EFAULT if @timeout cannot be copied from user
  *	space, or -EINTR if nothing was read and a signal is pending.
  */
 static long read_events(struct kioctx *ctx, long min_nr, long nr,
 			struct io_event __user *event,
 			struct timespec __user *timeout)
 {
 	ktime_t until = { .tv64 = KTIME_MAX };
 	long ret = 0;
 	if (timeout) {
 		struct timespec	user_ts;
 		if (unlikely(copy_from_user(&user_ts, timeout, sizeof(user_ts))))
 			return -EFAULT;
 		until = timespec_to_ktime(user_ts);
 	}
 	/*
 	 * aio_read_events() serves as the wait condition below, which means
 	 * it runs after prepare_to_wait() has already put the task into
 	 * TASK_INTERRUPTIBLE.
 	 *
 	 * It may block, though, and blocking resets the task state to
 	 * TASK_RUNNING.  That is tolerable as long as it does not happen
 	 * together with a 0 return too often, because that combination
 	 * makes us spin.  It requires the mutex_lock() call to block and
 	 * the ring buffer to then be found empty, so it should be rare in
 	 * practice - but keep it in mind when changing this code.
 	 */
 	if (until.tv64 != 0) {
 		wait_event_interruptible_hrtimeout(ctx->wait,
 				aio_read_events(ctx, min_nr, nr, event, &ret),
 				until);
 	} else {
 		aio_read_events(ctx, min_nr, nr, event, &ret);
 	}
 	if (ret == 0 && signal_pending(current))
 		return -EINTR;
 	return ret;
 }
 /* sys_io_setup:
  *	Create an aio_context able to receive at least nr_events events
  *	and store its handle in *ctxp.
  *
  *	*ctxp must read as 0 on entry and nr_events must be non-zero,
  *	otherwise -EINVAL is returned.  A fault while reading or writing
  *	*ctxp yields the get_user()/put_user() error.  Errors from
  *	ioctx_alloc() are passed through unchanged (the original interface
  *	description lists -EINVAL for nr_events beyond internal limits,
  *	-EAGAIN for exceeding the user's event limit and -ENOMEM for lack
  *	of kernel resources).  If the handle cannot be written back, the
  *	freshly created context is killed again.  Will fail with -ENOSYS
  *	if not implemented.
  */
 SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
 {
 	struct kioctx *ioctx;
 	unsigned long ctx;
 	long ret;
 	ret = get_user(ctx, ctxp);
 	if (unlikely(ret))
 		return ret;
 	if (unlikely(ctx || nr_events == 0)) {
 		pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n",
 		         ctx, nr_events);
 		return -EINVAL;
 	}
 	ioctx = ioctx_alloc(nr_events);
 	if (IS_ERR(ioctx))
 		return PTR_ERR(ioctx);
 	ret = put_user(ioctx->user_id, ctxp);
 	if (ret)
 		kill_ioctx(current->mm, ioctx, NULL);
 	/* Drop the ctx->users reference on both outcomes. */
 	percpu_ref_put(&ioctx->users);
 	return ret;
 }
 /* sys_io_destroy:
  *	Destroy the aio_context specified.  May cancel any outstanding
  *	AIOs and block on completion.  Will fail with -ENOSYS if not
  *	implemented.  May fail with -EINVAL if the context pointed to
  *	is invalid.  Otherwise returns the result of kill_ioctx(); when
  *	that is 0 the call first waits for the requests_done completion.
  */
 SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
 {
 	/* On success lookup_ioctx() returns with a ctx->users ref held. */
 	struct kioctx *ioctx = lookup_ioctx(ctx);
 	if (likely(NULL != ioctx)) {
 		struct completion requests_done =
 			COMPLETION_INITIALIZER_ONSTACK(requests_done);
 		int ret;
 		/* Pass requests_done to kill_ioctx() where it can be set
 		 * in a thread-safe way. If we try to set it here then we have
 		 * a race condition if two io_destroy() called simultaneously.
 		 */
 		ret = kill_ioctx(current->mm, ioctx, &requests_done);
 		/* Drop the lookup reference before (possibly) sleeping. */
 		percpu_ref_put(&ioctx->users);
 		/* Wait until all IO for the context are done. Otherwise kernel
 		 * keep using user-space buffers even if user thinks the context
 		 * is destroyed.
 		 */
 		if (!ret)
 			wait_for_completion(&requests_done);
 		return ret;
 	}
 	pr_debug("EINVAL: io_destroy: invalid context id\n");
 	return -EINVAL;
 }
 /* Function type of the ->aio_read / ->aio_write file operations. */
 typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *,
 			    unsigned long, loff_t);
 /* Function type of the ->read_iter / ->write_iter file operations. */
 typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *);
 /* aio_setup_vectored_rw
  *	Prepare the iovec array for a vectored (PREADV/PWRITEV) request.
  *
  *	On entry kiocb->ki_nbytes holds the number of segments; it is
  *	copied to *@nr_segs and the user iovec array at @buf is imported
  *	through (compat_)rw_copy_check_uvector(), which may replace *@iovec.
  *	On success ki_nbytes is overwritten with the value that helper
  *	returned (a byte count) and 0 is returned; otherwise the helper's
  *	negative error is returned.
  */
 static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,
 				     int rw, char __user *buf,
 				     unsigned long *nr_segs,
 				     struct iovec **iovec,
 				     bool compat)
 {
 	ssize_t total;
 	*nr_segs = kiocb->ki_nbytes;
 #ifdef CONFIG_COMPAT
 	if (compat)
 		total = compat_rw_copy_check_uvector(rw,
 				(struct compat_iovec __user *)buf,
 				*nr_segs, UIO_FASTIOV, *iovec, iovec);
 	else
 #endif
 		total = rw_copy_check_uvector(rw,
 				(struct iovec __user *)buf,
 				*nr_segs, UIO_FASTIOV, *iovec, iovec);
 	if (total >= 0) {
 		/* from here on ki_nbytes counts bytes, not segments */
 		kiocb->ki_nbytes = total;
 		return 0;
 	}
 	return total;
 }
 /* aio_setup_single_vector
  *	Describe a plain (non-vectored) request as a one-element iovec.
  *
  *	Checks the user range [@buf, @buf + ki_nbytes) with access_ok(),
  *	then fills @iovec with it and sets *@nr_segs to 1.
  *	Returns 0 on success or -EFAULT if the range check fails.
  */
 static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
 				       int rw, char __user *buf,
 				       unsigned long *nr_segs,
 				       struct iovec *iovec)
 {
 	if (likely(access_ok(!rw, buf, kiocb->ki_nbytes))) {
 		*nr_segs = 1;
 		iovec->iov_len = kiocb->ki_nbytes;
 		iovec->iov_base = buf;
 		return 0;
 	}
 	return -EFAULT;
 }
 /*
  * aio_run_iocb:
  *	Performs the initial checks and io submission.
  *
  *	@req:    prepared kiocb (->ki_filp, ->ki_pos, ->ki_nbytes are read)
  *	@opcode: one of the IOCB_CMD_* values handled in the switch below
  *	@buf:    user buffer, or user iovec array for the vectored opcodes
  *	@compat: passed to aio_setup_vectored_rw() to select compat iovecs
  *
  *	Returns a negative errno if the request is rejected before being
  *	issued; in that case aio_complete() has NOT been called.  Returns 0
  *	otherwise: either the operation returned -EIOCBQUEUED, or its
  *	result has been delivered through aio_complete().
  */
 static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
 			    char __user *buf, bool compat)
 {
 	struct file *file = req->ki_filp;
 	ssize_t ret;
 	unsigned long nr_segs;
 	int rw;
 	fmode_t mode;
 	aio_rw_op *rw_op;
 	rw_iter_op *iter_op;
 	/* iovec points at the on-stack array unless a helper replaces it */
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct iov_iter iter;
 	switch (opcode) {
 	case IOCB_CMD_PREAD:
 	case IOCB_CMD_PREADV:
 		mode	= FMODE_READ;
 		rw	= READ;
 		rw_op	= file->f_op->aio_read;
 		iter_op	= file->f_op->read_iter;
 		goto rw_common;
 	case IOCB_CMD_PWRITE:
 	case IOCB_CMD_PWRITEV:
 		mode	= FMODE_WRITE;
 		rw	= WRITE;
 		rw_op	= file->f_op->aio_write;
 		iter_op	= file->f_op->write_iter;
 		goto rw_common;
 rw_common:
 		/* file must be open for the requested direction */
 		if (unlikely(!(file->f_mode & mode)))
 			return -EBADF;
 		/* need at least one of the two method flavours */
 		if (!rw_op && !iter_op)
 			return -EINVAL;
 		ret = (opcode == IOCB_CMD_PREADV ||
 		       opcode == IOCB_CMD_PWRITEV)
 			? aio_setup_vectored_rw(req, rw, buf, &nr_segs,
 						&iovec, compat)
 			: aio_setup_single_vector(req, rw, buf, &nr_segs,
 						  iovec);
 		if (!ret)
 			ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
 		if (ret < 0) {
 			/* free the iovec only if it is no longer the stack array */
 			if (iovec != inline_vecs)
 				kfree(iovec);
 			return ret;
 		}
 		/* adopt the byte count returned by rw_verify_area() */
 		req->ki_nbytes = ret;
 		/* XXX: move/kill - rw_verify_area()? */
 		/* This matches the pread()/pwrite() logic */
 		if (req->ki_pos < 0) {
 			ret = -EINVAL;
 			break;
 		}
 		if (rw == WRITE)
 			file_start_write(file);
 		/* the iterator-based method is preferred when available */
 		if (iter_op) {
 			iov_iter_init(&iter, rw, iovec, nr_segs, req->ki_nbytes);
 			ret = iter_op(req, &iter);
 		} else {
 			ret = rw_op(req, iovec, nr_segs, req->ki_pos);
 		}
 		if (rw == WRITE)
 			file_end_write(file);
 		break;
 	case IOCB_CMD_FDSYNC:
 		if (!file->f_op->aio_fsync)
 			return -EINVAL;
 		ret = file->f_op->aio_fsync(req, 1);
 		break;
 	case IOCB_CMD_FSYNC:
 		if (!file->f_op->aio_fsync)
 			return -EINVAL;
 		ret = file->f_op->aio_fsync(req, 0);
 		break;
 	default:
 		pr_debug("EINVAL: no operation provided\n");
 		return -EINVAL;
 	}
 	if (iovec != inline_vecs)
 		kfree(iovec);
 	/* anything but -EIOCBQUEUED is completed right here */
 	if (ret != -EIOCBQUEUED) {
 		/*
 		 * There's no easy way to restart the syscall since other AIO's
 		 * may be already running. Just fail this IO with EINTR.
 		 */
 		if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
 			     ret == -ERESTARTNOHAND ||
 			     ret == -ERESTART_RESTARTBLOCK))
 			ret = -EINTR;
 		aio_complete(req, ret, 0);
 	}
 	return 0;
 }
 /* io_submit_one
  *	Validate a single user iocb (already copied into @iocb), build a
  *	kiocb for it on @ctx and hand it to aio_run_iocb().
  *
  *	Returns 0 once the request has been handed off.  Returns -EINVAL
  *	for set reserved fields or out-of-range buf/nbytes, -EAGAIN when no
  *	request slot is available, -EBADF for a bad file descriptor, or the
  *	error from eventfd_ctx_fdget(), put_user() or aio_run_iocb().  On
  *	any failure after the kiocb was obtained, the request slot, the
  *	ctx->reqs reference and the kiocb itself are released again.
  */
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 			 struct iocb *iocb, bool compat)
 {
 	struct kiocb *req;
 	ssize_t ret;
 	/* reserved fields must be zero so they can be given meaning later */
 	if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) {
 		pr_debug("EINVAL: reserve field set\n");
 		return -EINVAL;
 	}
 	/* values must survive the narrowing casts and nbytes must fit ssize_t */
 	if (unlikely(
 	    ((ssize_t)iocb->aio_nbytes < 0) ||
 	    (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
 	    (iocb->aio_buf != (unsigned long)iocb->aio_buf)
 	   )) {
 		pr_debug("EINVAL: io_submit: overflow check\n");
 		return -EINVAL;
 	}
 	req = aio_get_req(ctx);
 	if (unlikely(!req))
 		return -EAGAIN;
 	req->ki_filp = fget(iocb->aio_fildes);
 	if (unlikely(!req->ki_filp)) {
 		ret = -EBADF;
 		goto out_put_req;
 	}
 	if (iocb->aio_flags & IOCB_FLAG_RESFD) {
 		/*
 		 * IOCB_FLAG_RESFD asks for completion notification through
 		 * an eventfd: resolve aio_resfd to its eventfd context now.
 		 * The context is signalled with eventfd_signal() for every
 		 * completed event.
 		 */
 		req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
 		if (IS_ERR(req->ki_eventfd)) {
 			ret = PTR_ERR(req->ki_eventfd);
 			/* keep kiocb_free() from putting an error pointer */
 			req->ki_eventfd = NULL;
 			goto out_put_req;
 		}
 	}
 	ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
 	if (unlikely(ret)) {
 		pr_debug("EFAULT: aio_key\n");
 		goto out_put_req;
 	}
 	req->ki_obj.user = user_iocb;
 	req->ki_user_data = iocb->aio_data;
 	req->ki_pos = iocb->aio_offset;
 	req->ki_nbytes = iocb->aio_nbytes;
 	ret = aio_run_iocb(req, iocb->aio_lio_opcode,
 			   (char __user *)(unsigned long)iocb->aio_buf,
 			   compat);
 	if (!ret)
 		return 0;
 out_put_req:
 	put_reqs_available(ctx, 1);
 	percpu_ref_put(&ctx->reqs);
 	kiocb_free(req);
 	return ret;
 }
 /* do_io_submit
  *	Common implementation behind io_submit: submit up to @nr iocbs
  *	whose user pointers are stored in the user array @iocbpp.
  *
  *	@ctx_id: user handle of the aio context
  *	@nr:     number of entries in @iocbpp (negative is -EINVAL; values
  *	         above LONG_MAX/sizeof(*iocbpp) are clamped)
  *	@iocbpp: user array of user pointers to struct iocb
  *	@compat: forwarded to io_submit_one()
  *
  *	Submission stops at the first entry that fails.  Returns the number
  *	of iocbs submitted if that is non-zero, otherwise the error of the
  *	first entry (or 0 when @nr is 0).  Returns -EFAULT if @iocbpp is not
  *	readable and -EINVAL if @ctx_id does not name a context.
  */
 long do_io_submit(aio_context_t ctx_id, long nr,
 		  struct iocb __user *__user *iocbpp, bool compat)
 {
 	struct kioctx *ctx;
 	long ret = 0;
 	/*
 	 * The counter has the same type as its bound 'nr' and as the return
 	 * value, so it can neither overflow nor be truncated.
 	 */
 	long i = 0;
 	struct blk_plug plug;
 	if (unlikely(nr < 0))
 		return -EINVAL;
 	/* keep nr * sizeof(*iocbpp) below from overflowing */
 	if (unlikely(nr > LONG_MAX/sizeof(*iocbpp)))
 		nr = LONG_MAX/sizeof(*iocbpp);
 	if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp)))))
 		return -EFAULT;
 	ctx = lookup_ioctx(ctx_id);
 	if (unlikely(!ctx)) {
 		pr_debug("EINVAL: invalid context id\n");
 		return -EINVAL;
 	}
 	blk_start_plug(&plug);
 	/*
 	 * AKPM: should this return a partial result if some of the IOs were
 	 * successfully submitted?
 	 */
 	for (i = 0; i < nr; i++) {
 		struct iocb __user *user_iocb;
 		struct iocb tmp;
 		/* the range was checked by access_ok() above */
 		if (unlikely(__get_user(user_iocb, iocbpp + i))) {
 			ret = -EFAULT;
 			break;
 		}
 		if (unlikely(copy_from_user(&tmp, user_iocb, sizeof(tmp)))) {
 			ret = -EFAULT;
 			break;
 		}
 		ret = io_submit_one(ctx, user_iocb, &tmp, compat);
 		if (ret)
 			break;
 	}
 	blk_finish_plug(&plug);
 	/* drop the reference taken by lookup_ioctx() */
 	percpu_ref_put(&ctx->users);
 	return i ? i : ret;
 }
 /* sys_io_submit:
  *	Queue the nr iocbs referenced by iocbpp on the aio_context named
  *	by ctx_id.  This is the native (non-compat) entry point; all work
  *	is done by do_io_submit().
  *
  *	Returns the number of iocbs queued, or 0 if nr is 0.  May return
  *	-EINVAL if ctx_id is invalid, if nr is < 0, if the iocb at
  *	*iocbpp[0] is not properly initialized or if the requested
  *	operation is invalid for the file descriptor in the iocb.  May fail
  *	with -EFAULT if any of the data structures point to invalid data,
  *	with -EBADF if the file descriptor in the first iocb is invalid and
  *	with -EAGAIN if there are insufficient resources to queue any
  *	iocbs.  Will fail with -ENOSYS if not implemented.
  */
 SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
 		struct iocb __user * __user *, iocbpp)
 {
 	return do_io_submit(ctx_id, nr, iocbpp, false);
 }
 /* lookup_kiocb
  *	Find the kiocb on ctx->active_reqs that was submitted from the user
  *	iocb @iocb, for cancellation.  @key must equal KIOCB_KEY.
  *	The caller must hold ctx->ctx_lock.
  *	Returns the matching kiocb, or NULL if the key is wrong or no
  *	active request matches.
  */
 static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
 				  u32 key)
 {
 	struct list_head *node;
 	assert_spin_locked(&ctx->ctx_lock);
 	if (key == KIOCB_KEY) {
 		/* TODO: use a hash or array, this sucks. */
 		list_for_each(node, &ctx->active_reqs) {
 			struct kiocb *candidate = list_kiocb(node);
 			if (candidate->ki_obj.user == iocb)
 				return candidate;
 		}
 	}
 	return NULL;
 }
 /* sys_io_cancel:
  *	Attempts to cancel an iocb previously passed to io_submit.
  *
  *	The iocb is looked up among the context's active requests and
  *	kiocb_cancel() is invoked on it.  The result argument is not used
  *	by this implementation: the io_event is always delivered through
  *	the ring buffer, and a successfully initiated cancellation is
  *	reported as -EINPROGRESS.
  *
  *	May fail with -EFAULT if the key cannot be read from the iocb.
  *	May fail with -EINVAL if the aio_context specified by ctx_id is
  *	invalid or no matching active request is found.  Any non-zero
  *	result of kiocb_cancel() is returned as is.  Will fail with
  *	-ENOSYS if not implemented.
  */
 SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 		struct io_event __user *, result)
 {
 	struct kioctx *ctx;
 	struct kiocb *kiocb;
 	u32 key;
 	int ret;
 	if (unlikely(get_user(key, &iocb->aio_key)))
 		return -EFAULT;
 	ctx = lookup_ioctx(ctx_id);
 	if (unlikely(!ctx))
 		return -EINVAL;
 	spin_lock_irq(&ctx->ctx_lock);
 	kiocb = lookup_kiocb(ctx, iocb, key);
 	if (!kiocb)
 		ret = -EINVAL;
 	else
 		ret = kiocb_cancel(kiocb);
 	spin_unlock_irq(&ctx->ctx_lock);
 	/*
 	 * The result argument is no longer used - the io_event is always
 	 * delivered via the ring buffer. -EINPROGRESS indicates that
 	 * cancellation is in progress:
 	 */
 	if (ret == 0)
 		ret = -EINPROGRESS;
 	percpu_ref_put(&ctx->users);
 	return ret;
 }
 /* io_getevents:
  *	Attempts to read at least min_nr and at most nr events from the
  *	completion queue of the aio_context specified by ctx_id, and
  *	returns the number of events read on success.
  *
  *	May fail with -EINVAL if ctx_id is invalid, or if min_nr is
  *	negative or larger than nr.  May fail with -EFAULT if any of the
  *	memory specified is invalid.  May return 0 or a count < min_nr if
  *	the timeout elapses before enough events are available; a NULL
  *	timeout means wait indefinitely.  Note that the timeout pointed to
  *	by timeout is relative.  Will fail with -ENOSYS if not implemented.
  */
 SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
 		long, min_nr,
 		long, nr,
 		struct io_event __user *, events,
 		struct timespec __user *, timeout)
 {
 	struct kioctx *ioctx = lookup_ioctx(ctx_id);
 	long ret = -EINVAL;
 	if (unlikely(!ioctx))
 		return ret;
 	if (likely(min_nr >= 0 && min_nr <= nr))
 		ret = read_events(ioctx, min_nr, nr, events, timeout);
 	/* drop the reference taken by lookup_ioctx() */
 	percpu_ref_put(&ioctx->users);
 	return ret;
 }