Blame view

drivers/vhost/vhost.c 40.5 KB
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1
2
3
4
5
6
  /* Copyright (C) 2009 Red Hat, Inc.
   * Copyright (C) 2006 Rusty Russell IBM Corporation
   *
   * Author: Michael S. Tsirkin <mst@redhat.com>
   *
   * Inspiration, some code, and most witty comments come from
615165875   Rob Landley   Correct occurrenc...
7
   * Documentation/virtual/lguest/lguest.c, by Rusty Russell
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
8
9
10
11
12
13
14
15
16
17
   *
   * This work is licensed under the terms of the GNU GPL, version 2.
   *
   * Generic code for virtio server in host kernel.
   */
  
  #include <linux/eventfd.h>
  #include <linux/vhost.h>
  #include <linux/virtio_net.h>
  #include <linux/mm.h>
64e1c8074   Michael S. Tsirkin   vhost-net: batch ...
18
  #include <linux/mmu_context.h>
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
19
20
  #include <linux/miscdevice.h>
  #include <linux/mutex.h>
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
21
22
23
24
  #include <linux/rcupdate.h>
  #include <linux/poll.h>
  #include <linux/file.h>
  #include <linux/highmem.h>
5a0e3ad6a   Tejun Heo   include cleanup: ...
25
  #include <linux/slab.h>
c23f3445e   Tejun Heo   vhost: replace vh...
26
  #include <linux/kthread.h>
9e3d19572   Michael S. Tsirkin   vhost: apply cgro...
27
  #include <linux/cgroup.h>
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
28
29
30
31
  
  #include <linux/net.h>
  #include <linux/if_packet.h>
  #include <linux/if_arp.h>
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
32
33
34
35
36
37
  #include "vhost.h"
  
  enum {
  	/* Upper bound on mem.nregions accepted by vhost_set_memory();
  	 * larger tables are rejected there with -E2BIG. */
  	VHOST_MEMORY_MAX_NREGIONS = 64,
  	/* NOTE(review): not referenced in the visible part of this file. */
  	VHOST_MEMORY_F_LOG = 0x1,
  };
bab632d69   Michael S. Tsirkin   vhost: vhost TX z...
38
  /* Bitmask indexed by virtqueue number: bit i is set by vhost_enable_zcopy(i)
   * and makes vhost_dev_alloc_iovecs() allocate ubuf_info for vq i. */
  static unsigned vhost_zcopy_mask __read_mostly;
8ea8cf89e   Michael S. Tsirkin   vhost: support ev...
39
40
  /*
   * Address of the u16 slot located at index vq->num of the avail ring
   * (vhost_used_event) or of the used ring (vhost_avail_event), i.e. just past
   * the last regular ring entry.  vq_access_ok() accounts for these two extra
   * bytes when VIRTIO_RING_F_EVENT_IDX is negotiated.
   *
   * The argument is parenthesized so that any pointer expression (for example
   * "vqs + i"), not only a plain identifier, expands correctly.
   */
  #define vhost_used_event(vq) ((u16 __user *)&(vq)->avail->ring[(vq)->num])
  #define vhost_avail_event(vq) ((u16 __user *)&(vq)->used->ring[(vq)->num])
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
41
42
43
44
  /*
   * poll_table queueing callback (installed with init_poll_funcptr() in
   * vhost_poll_init()).  Remembers the wait queue head handed to us in the
   * vhost_poll that embeds @pt and adds our wait entry to that queue.
   */
  static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
  			    poll_table *pt)
  {
  	struct vhost_poll *poll = container_of(pt, struct vhost_poll, table);
  
  	/* Saved so vhost_poll_stop() knows which queue to leave. */
  	poll->wqh = wqh;
  	add_wait_queue(wqh, &poll->wait);
  }
  
  /*
   * Wait-queue wake function for a vhost_poll.  @key carries the event bits;
   * the poll's work item is queued only when at least one of them is in
   * poll->mask.  Always returns 0.
   */
  static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
  			     void *key)
  {
  	struct vhost_poll *poll;
  	unsigned long events = (unsigned long)key;
  
  	poll = container_of(wait, struct vhost_poll, wait);
  	if (events & poll->mask)
  		vhost_poll_queue(poll);
  
  	return 0;
  }
87d6a412b   Michael S. Tsirkin   vhost: fix attach...
60
61
62
63
64
65
66
67
  /*
   * Put @work into its initial state: callback set to @fn, not linked on any
   * list, no flushers registered and both sequence counters at zero.
   */
  static void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
  {
  	work->fn = fn;
  	work->flushing = 0;
  	work->done_seq = 0;
  	work->queue_seq = 0;
  	INIT_LIST_HEAD(&work->node);
  	init_waitqueue_head(&work->done);
  }
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
68
  /* Init poll structure */
c23f3445e   Tejun Heo   vhost: replace vh...
69
70
  void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
  		     unsigned long mask, struct vhost_dev *dev)
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
71
  {
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
72
73
74
  	init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
  	init_poll_funcptr(&poll->table, vhost_poll_func);
  	poll->mask = mask;
c23f3445e   Tejun Heo   vhost: replace vh...
75
  	poll->dev = dev;
87d6a412b   Michael S. Tsirkin   vhost: fix attach...
76
  	vhost_work_init(&poll->work, fn);
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
77
78
79
80
81
82
83
  }
  
  /* Start polling a file. We add ourselves to file's wait queue. The caller must
   * keep a reference to a file until after vhost_poll_stop is called. */
  void vhost_poll_start(struct vhost_poll *poll, struct file *file)
  {
  	unsigned long mask;
d47effe1b   Krishna Kumar   vhost: Cleanup vh...
84

3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
85
86
87
88
89
90
91
92
93
94
95
  	mask = file->f_op->poll(file, &poll->table);
  	if (mask)
  		vhost_poll_wakeup(&poll->wait, 0, 0, (void *)mask);
  }
  
  /* Stop polling a file. After this function returns, it becomes safe to drop the
   * file reference. You must also flush afterwards. */
  void vhost_poll_stop(struct vhost_poll *poll)
  {
  	remove_wait_queue(poll->wqh, &poll->wait);
  }
0174b0c30   Michael S. Tsirkin   vhost: fix signed...
96
97
98
99
  static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work,
  				unsigned seq)
  {
  	int left;
d47effe1b   Krishna Kumar   vhost: Cleanup vh...
100

0174b0c30   Michael S. Tsirkin   vhost: fix signed...
101
102
103
104
105
  	spin_lock_irq(&dev->work_lock);
  	left = seq - work->done_seq;
  	spin_unlock_irq(&dev->work_lock);
  	return left <= 0;
  }
87d6a412b   Michael S. Tsirkin   vhost: fix attach...
106
  static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
107
  {
c23f3445e   Tejun Heo   vhost: replace vh...
108
  	unsigned seq;
c23f3445e   Tejun Heo   vhost: replace vh...
109
  	int flushing;
87d6a412b   Michael S. Tsirkin   vhost: fix attach...
110
  	spin_lock_irq(&dev->work_lock);
c23f3445e   Tejun Heo   vhost: replace vh...
111
112
  	seq = work->queue_seq;
  	work->flushing++;
87d6a412b   Michael S. Tsirkin   vhost: fix attach...
113
  	spin_unlock_irq(&dev->work_lock);
0174b0c30   Michael S. Tsirkin   vhost: fix signed...
114
  	wait_event(work->done, vhost_work_seq_done(dev, work, seq));
87d6a412b   Michael S. Tsirkin   vhost: fix attach...
115
  	spin_lock_irq(&dev->work_lock);
c23f3445e   Tejun Heo   vhost: replace vh...
116
  	flushing = --work->flushing;
87d6a412b   Michael S. Tsirkin   vhost: fix attach...
117
  	spin_unlock_irq(&dev->work_lock);
c23f3445e   Tejun Heo   vhost: replace vh...
118
  	BUG_ON(flushing < 0);
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
119
  }
87d6a412b   Michael S. Tsirkin   vhost: fix attach...
120
121
122
123
124
125
126
127
128
  /* Wait for any already scheduled work of @poll to finish.  Do not call this
   * while holding a lock that the work callback also takes. */
  void vhost_poll_flush(struct vhost_poll *poll)
  {
  	struct vhost_work *work = &poll->work;
  
  	vhost_work_flush(poll->dev, work);
  }
  
  static inline void vhost_work_queue(struct vhost_dev *dev,
  				    struct vhost_work *work)
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
129
  {
c23f3445e   Tejun Heo   vhost: replace vh...
130
131
132
133
134
135
136
137
138
  	unsigned long flags;
  
  	spin_lock_irqsave(&dev->work_lock, flags);
  	if (list_empty(&work->node)) {
  		list_add_tail(&work->node, &dev->work_list);
  		work->queue_seq++;
  		wake_up_process(dev->worker);
  	}
  	spin_unlock_irqrestore(&dev->work_lock, flags);
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
139
  }
87d6a412b   Michael S. Tsirkin   vhost: fix attach...
140
141
142
143
  /* Queue the work item embedded in @poll on the poll's owning device. */
  void vhost_poll_queue(struct vhost_poll *poll)
  {
  	struct vhost_dev *owner = poll->dev;
  
  	vhost_work_queue(owner, &poll->work);
  }
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
144
145
146
147
148
149
150
151
152
153
  /*
   * Return @vq to its default, unconfigured state.  Only plain field stores are
   * performed: nothing referenced by the old pointer values is released here,
   * so callers drop those references first (see vhost_dev_cleanup()).
   * @dev is not used.
   */
  static void vhost_vq_reset(struct vhost_dev *dev,
  			   struct vhost_virtqueue *vq)
  {
  	/* Ring geometry and user-space ring addresses. */
  	vq->num = 1;
  	vq->desc = NULL;
  	vq->avail = NULL;
  	vq->used = NULL;
  
  	/* Ring indices and notification state. */
  	vq->last_avail_idx = 0;
  	vq->avail_idx = 0;
  	vq->last_used_idx = 0;
  	vq->signalled_used = 0;
  	vq->signalled_used_valid = false;
  	vq->used_flags = 0;
  
  	/* Logging. */
  	vq->log_used = false;
  	vq->log_addr = -1ull;
  	vq->log_base = NULL;
  	vq->log_ctx = NULL;
  
  	/* Header lengths and backend. */
  	vq->vhost_hlen = 0;
  	vq->sock_hlen = 0;
  	vq->private_data = NULL;
  
  	/* Eventfd files and contexts. */
  	vq->error_ctx = NULL;
  	vq->error = NULL;
  	vq->kick = NULL;
  	vq->call_ctx = NULL;
  	vq->call = NULL;
  
  	/* Zero-copy bookkeeping. */
  	vq->upend_idx = 0;
  	vq->done_idx = 0;
  	vq->ubufs = NULL;
  }
c23f3445e   Tejun Heo   vhost: replace vh...
173
174
175
176
177
  /*
   * Thread function of the per-device worker (created in vhost_dev_set_owner()).
   * Repeatedly takes the first item off dev->work_list and runs its callback;
   * sleeps when the list is empty; leaves the loop once kthread_should_stop()
   * is true.  Returns 0.
   */
  static int vhost_worker(void *data)
  {
  	struct vhost_dev *dev = data;
  	struct vhost_work *work = NULL;
  	/* seq is only read when work != NULL, and work only becomes non-NULL
  	 * in an iteration that also assigns seq. */
  	unsigned uninitialized_var(seq);
64e1c8074   Michael S. Tsirkin   vhost-net: batch ...
178
  	/* NOTE(review): presumably lets work callbacks access the owner's
  	 * user memory - confirm against use_mm() documentation. */
  	use_mm(dev->mm);
c23f3445e   Tejun Heo   vhost: replace vh...
179
180
181
182
183
184
185
186
187
188
189
190
191
192
  	for (;;) {
  		/* mb paired w/ kthread_stop */
  		/* The sleeping state is set before the stop flag and the work
  		 * list are examined; schedule() below is only reached with
  		 * this state still in place. */
  		set_current_state(TASK_INTERRUPTIBLE);
  
  		spin_lock_irq(&dev->work_lock);
  		/* Publish completion of the item run in the previous
  		 * iteration and wake flushers, if any are registered. */
  		if (work) {
  			work->done_seq = seq;
  			if (work->flushing)
  				wake_up_all(&work->done);
  		}
  
  		if (kthread_should_stop()) {
  			spin_unlock_irq(&dev->work_lock);
  			__set_current_state(TASK_RUNNING);
64e1c8074   Michael S. Tsirkin   vhost-net: batch ...
193
  			break;
c23f3445e   Tejun Heo   vhost: replace vh...
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
  		}
  		/* Dequeue the next item; list_del_init() leaves the node
  		 * empty so vhost_work_queue() can queue it again, and
  		 * queue_seq is sampled under the lock. */
  		if (!list_empty(&dev->work_list)) {
  			work = list_first_entry(&dev->work_list,
  						struct vhost_work, node);
  			list_del_init(&work->node);
  			seq = work->queue_seq;
  		} else
  			work = NULL;
  		spin_unlock_irq(&dev->work_lock);
  
  		/* Run the callback outside the lock, or sleep if idle. */
  		if (work) {
  			__set_current_state(TASK_RUNNING);
  			work->fn(work);
  		} else
  			schedule();
  
  	}
64e1c8074   Michael S. Tsirkin   vhost-net: batch ...
211
212
  	unuse_mm(dev->mm);
  	return 0;
c23f3445e   Tejun Heo   vhost: replace vh...
213
  }
bab632d69   Michael S. Tsirkin   vhost: vhost TX z...
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
  /*
   * Free the per-virtqueue scratch arrays allocated by
   * vhost_dev_alloc_iovecs() and clear the pointers, so calling this again on
   * the same vq is harmless.
   */
  static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
  {
  	kfree(vq->indirect);
  	kfree(vq->log);
  	kfree(vq->heads);
  	kfree(vq->ubuf_info);
  
  	vq->indirect = NULL;
  	vq->log = NULL;
  	vq->heads = NULL;
  	vq->ubuf_info = NULL;
  }
  
  /* Mark virtqueue number @vq as zero-copy capable by setting its bit in
   * vhost_zcopy_mask. */
  void vhost_enable_zcopy(int vq)
  {
  	vhost_zcopy_mask = vhost_zcopy_mask | (0x1 << vq);
  }
e0e9b4064   Jason Wang   vhost: max s/g to...
230
231
232
233
  /* Helper to allocate iovec buffers for all vqs. */
  static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
  {
  	int i;
bab632d69   Michael S. Tsirkin   vhost: vhost TX z...
234
  	bool zcopy;
d47effe1b   Krishna Kumar   vhost: Cleanup vh...
235

e0e9b4064   Jason Wang   vhost: max s/g to...
236
237
238
239
240
241
242
  	for (i = 0; i < dev->nvqs; ++i) {
  		dev->vqs[i].indirect = kmalloc(sizeof *dev->vqs[i].indirect *
  					       UIO_MAXIOV, GFP_KERNEL);
  		dev->vqs[i].log = kmalloc(sizeof *dev->vqs[i].log * UIO_MAXIOV,
  					  GFP_KERNEL);
  		dev->vqs[i].heads = kmalloc(sizeof *dev->vqs[i].heads *
  					    UIO_MAXIOV, GFP_KERNEL);
bab632d69   Michael S. Tsirkin   vhost: vhost TX z...
243
244
245
246
247
  		zcopy = vhost_zcopy_mask & (0x1 << i);
  		if (zcopy)
  			dev->vqs[i].ubuf_info =
  				kmalloc(sizeof *dev->vqs[i].ubuf_info *
  					UIO_MAXIOV, GFP_KERNEL);
e0e9b4064   Jason Wang   vhost: max s/g to...
248
  		if (!dev->vqs[i].indirect || !dev->vqs[i].log ||
bab632d69   Michael S. Tsirkin   vhost: vhost TX z...
249
250
  			!dev->vqs[i].heads ||
  			(zcopy && !dev->vqs[i].ubuf_info))
e0e9b4064   Jason Wang   vhost: max s/g to...
251
252
253
  			goto err_nomem;
  	}
  	return 0;
d47effe1b   Krishna Kumar   vhost: Cleanup vh...
254

e0e9b4064   Jason Wang   vhost: max s/g to...
255
  err_nomem:
bab632d69   Michael S. Tsirkin   vhost: vhost TX z...
256
257
  	for (; i >= 0; --i)
  		vhost_vq_free_iovecs(&dev->vqs[i]);
e0e9b4064   Jason Wang   vhost: max s/g to...
258
259
260
261
262
263
  	return -ENOMEM;
  }
  
  static void vhost_dev_free_iovecs(struct vhost_dev *dev)
  {
  	int i;
d47effe1b   Krishna Kumar   vhost: Cleanup vh...
264

bab632d69   Michael S. Tsirkin   vhost: vhost TX z...
265
266
  	for (i = 0; i < dev->nvqs; ++i)
  		vhost_vq_free_iovecs(&dev->vqs[i]);
e0e9b4064   Jason Wang   vhost: max s/g to...
267
  }
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
268
269
270
271
  long vhost_dev_init(struct vhost_dev *dev,
  		    struct vhost_virtqueue *vqs, int nvqs)
  {
  	int i;
c23f3445e   Tejun Heo   vhost: replace vh...
272

3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
273
274
275
276
277
278
279
  	dev->vqs = vqs;
  	dev->nvqs = nvqs;
  	mutex_init(&dev->mutex);
  	dev->log_ctx = NULL;
  	dev->log_file = NULL;
  	dev->memory = NULL;
  	dev->mm = NULL;
c23f3445e   Tejun Heo   vhost: replace vh...
280
281
282
  	spin_lock_init(&dev->work_lock);
  	INIT_LIST_HEAD(&dev->work_list);
  	dev->worker = NULL;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
283
284
  
  	for (i = 0; i < dev->nvqs; ++i) {
e0e9b4064   Jason Wang   vhost: max s/g to...
285
286
287
  		dev->vqs[i].log = NULL;
  		dev->vqs[i].indirect = NULL;
  		dev->vqs[i].heads = NULL;
bab632d69   Michael S. Tsirkin   vhost: vhost TX z...
288
  		dev->vqs[i].ubuf_info = NULL;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
289
290
291
292
293
  		dev->vqs[i].dev = dev;
  		mutex_init(&dev->vqs[i].mutex);
  		vhost_vq_reset(dev, dev->vqs + i);
  		if (dev->vqs[i].handle_kick)
  			vhost_poll_init(&dev->vqs[i].poll,
c23f3445e   Tejun Heo   vhost: replace vh...
294
  					dev->vqs[i].handle_kick, POLLIN, dev);
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
295
  	}
c23f3445e   Tejun Heo   vhost: replace vh...
296

3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
297
298
299
300
301
302
303
304
305
  	return 0;
  }
  
  /* Caller should have device mutex.
   * Returns 0 when the calling task's mm is the device owner's mm,
   * -EPERM otherwise. */
  long vhost_dev_check_owner(struct vhost_dev *dev)
  {
  	/* Not the owner? Then you probably did not mean to do that. */
  	if (dev->mm != current->mm)
  		return -EPERM;
  
  	return 0;
  }
87d6a412b   Michael S. Tsirkin   vhost: fix attach...
306
  /* Request passed to the worker by vhost_attach_cgroups(): the work item plus
   * its input (owner) and output (ret). */
  struct vhost_attach_cgroups_struct {
d47effe1b   Krishna Kumar   vhost: Cleanup vh...
307
308
309
  	/* Work item; vhost_attach_cgroups_work() recovers the request from it. */
  	struct vhost_work work;
  	/* Task passed as first argument to cgroup_attach_task_all(). */
  	struct task_struct *owner;
  	/* Return value of cgroup_attach_task_all(), filled in by the worker. */
  	int ret;
87d6a412b   Michael S. Tsirkin   vhost: fix attach...
310
311
312
313
  };
  
  static void vhost_attach_cgroups_work(struct vhost_work *work)
  {
d47effe1b   Krishna Kumar   vhost: Cleanup vh...
314
315
316
317
  	struct vhost_attach_cgroups_struct *s;
  
  	s = container_of(work, struct vhost_attach_cgroups_struct, work);
  	s->ret = cgroup_attach_task_all(s->owner, current);
87d6a412b   Michael S. Tsirkin   vhost: fix attach...
318
319
320
321
  }
  
  static int vhost_attach_cgroups(struct vhost_dev *dev)
  {
d47effe1b   Krishna Kumar   vhost: Cleanup vh...
322
323
324
325
326
327
328
  	struct vhost_attach_cgroups_struct attach;
  
  	attach.owner = current;
  	vhost_work_init(&attach.work, vhost_attach_cgroups_work);
  	vhost_work_queue(dev, &attach.work);
  	vhost_work_flush(dev, &attach.work);
  	return attach.ret;
87d6a412b   Michael S. Tsirkin   vhost: fix attach...
329
  }
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
330
331
332
  /* Caller should have device mutex */
  static long vhost_dev_set_owner(struct vhost_dev *dev)
  {
c23f3445e   Tejun Heo   vhost: replace vh...
333
334
  	struct task_struct *worker;
  	int err;
d47effe1b   Krishna Kumar   vhost: Cleanup vh...
335

3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
336
  	/* Is there an owner already? */
c23f3445e   Tejun Heo   vhost: replace vh...
337
338
339
340
  	if (dev->mm) {
  		err = -EBUSY;
  		goto err_mm;
  	}
d47effe1b   Krishna Kumar   vhost: Cleanup vh...
341

3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
342
343
  	/* No owner, become one */
  	dev->mm = get_task_mm(current);
c23f3445e   Tejun Heo   vhost: replace vh...
344
345
346
347
348
349
350
  	worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
  	if (IS_ERR(worker)) {
  		err = PTR_ERR(worker);
  		goto err_worker;
  	}
  
  	dev->worker = worker;
87d6a412b   Michael S. Tsirkin   vhost: fix attach...
351
352
353
  	wake_up_process(worker);	/* avoid contributing to loadavg */
  
  	err = vhost_attach_cgroups(dev);
9e3d19572   Michael S. Tsirkin   vhost: apply cgro...
354
355
  	if (err)
  		goto err_cgroup;
c23f3445e   Tejun Heo   vhost: replace vh...
356

e0e9b4064   Jason Wang   vhost: max s/g to...
357
358
359
  	err = vhost_dev_alloc_iovecs(dev);
  	if (err)
  		goto err_cgroup;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
360
  	return 0;
9e3d19572   Michael S. Tsirkin   vhost: apply cgro...
361
362
  err_cgroup:
  	kthread_stop(worker);
615cc2211   Michael S. Tsirkin   vhost: error hand...
363
  	dev->worker = NULL;
c23f3445e   Tejun Heo   vhost: replace vh...
364
365
366
367
368
369
  err_worker:
  	if (dev->mm)
  		mmput(dev->mm);
  	dev->mm = NULL;
  err_mm:
  	return err;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
  }
  
  /* Caller should have device mutex */
  long vhost_dev_reset_owner(struct vhost_dev *dev)
  {
  	struct vhost_memory *memory;
  
  	/* Restore memory to default empty mapping. */
  	memory = kmalloc(offsetof(struct vhost_memory, regions), GFP_KERNEL);
  	if (!memory)
  		return -ENOMEM;
  
  	vhost_dev_cleanup(dev);
  
  	memory->nregions = 0;
28457ee69   Arnd Bergmann   vhost: add __rcu ...
385
  	RCU_INIT_POINTER(dev->memory, memory);
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
386
387
  	return 0;
  }
bab632d69   Michael S. Tsirkin   vhost: vhost TX z...
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
  /* The lower device driver may complete DMA out of order.  upend_idx tracks
   * the end of the outstanding used entries and done_idx their head.  Starting
   * at done_idx, every contiguous entry marked VHOST_DMA_DONE_LEN is cleared and
   * reported to the guest; the walk stops at the first entry that is not done.
   * Returns the number of entries reported; done_idx is advanced only when that
   * number is non-zero.
   */
  int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
  {
  	int idx = vq->done_idx;
  	int completed = 0;
  
  	while (idx != vq->upend_idx) {
  		if (vq->heads[idx].len != VHOST_DMA_DONE_LEN)
  			break;
  
  		vq->heads[idx].len = VHOST_DMA_CLEAR_LEN;
  		vhost_add_used_and_signal(vq->dev, vq, vq->heads[idx].id, 0);
  		++completed;
  		idx = (idx + 1) % UIO_MAXIOV;
  	}
  
  	if (completed)
  		vq->done_idx = idx;
  	return completed;
  }
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
411
412
413
414
  /* Caller should have device mutex */
  void vhost_dev_cleanup(struct vhost_dev *dev)
  {
  	int i;
d47effe1b   Krishna Kumar   vhost: Cleanup vh...
415

3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
416
417
418
419
420
  	for (i = 0; i < dev->nvqs; ++i) {
  		if (dev->vqs[i].kick && dev->vqs[i].handle_kick) {
  			vhost_poll_stop(&dev->vqs[i].poll);
  			vhost_poll_flush(&dev->vqs[i].poll);
  		}
bab632d69   Michael S. Tsirkin   vhost: vhost TX z...
421
422
423
424
425
426
  		/* Wait for all lower device DMAs done. */
  		if (dev->vqs[i].ubufs)
  			vhost_ubuf_put_and_wait(dev->vqs[i].ubufs);
  
  		/* Signal guest as appropriate. */
  		vhost_zerocopy_signal_used(&dev->vqs[i]);
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
427
428
429
430
431
432
433
434
435
436
437
438
  		if (dev->vqs[i].error_ctx)
  			eventfd_ctx_put(dev->vqs[i].error_ctx);
  		if (dev->vqs[i].error)
  			fput(dev->vqs[i].error);
  		if (dev->vqs[i].kick)
  			fput(dev->vqs[i].kick);
  		if (dev->vqs[i].call_ctx)
  			eventfd_ctx_put(dev->vqs[i].call_ctx);
  		if (dev->vqs[i].call)
  			fput(dev->vqs[i].call);
  		vhost_vq_reset(dev, dev->vqs + i);
  	}
e0e9b4064   Jason Wang   vhost: max s/g to...
439
  	vhost_dev_free_iovecs(dev);
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
440
441
442
443
444
445
446
  	if (dev->log_ctx)
  		eventfd_ctx_put(dev->log_ctx);
  	dev->log_ctx = NULL;
  	if (dev->log_file)
  		fput(dev->log_file);
  	dev->log_file = NULL;
  	/* No one will access memory at this point */
28457ee69   Arnd Bergmann   vhost: add __rcu ...
447
448
449
  	kfree(rcu_dereference_protected(dev->memory,
  					lockdep_is_held(&dev->mutex)));
  	RCU_INIT_POINTER(dev->memory, NULL);
c23f3445e   Tejun Heo   vhost: replace vh...
450
  	WARN_ON(!list_empty(&dev->work_list));
78b620ce9   Eric Dumazet   vhost: stop worke...
451
452
453
454
  	if (dev->worker) {
  		kthread_stop(dev->worker);
  		dev->worker = NULL;
  	}
533a19b4b   Michael S. Tsirkin   vhost: put mm aft...
455
456
457
  	if (dev->mm)
  		mmput(dev->mm);
  	dev->mm = NULL;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
458
459
460
461
462
  }
  
  /*
   * Check that the part of the log at @log_base covering @sz bytes starting at
   * address @addr is writable.  The arithmetic maps each VHOST_PAGE_SIZE of
   * address space to one bit (addr / VHOST_PAGE_SIZE / 8 is a byte offset).
   * Returns 0 when the offset does not fit in the address range, otherwise the
   * result of access_ok().
   */
  static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
  {
  	unsigned long base = (unsigned long)log_base;
  	u64 byte_off = addr / VHOST_PAGE_SIZE / 8;
  
  	/* Make sure 64 bit math will not overflow. */
  	if (byte_off > ULONG_MAX - base)
  		return 0;
  	if (byte_off + base > ULONG_MAX)
  		return 0;
  
  	return access_ok(VERIFY_WRITE, log_base + byte_off,
  			 (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8);
  }
  
  /* Caller should have vq mutex and device mutex.
   * Returns 1 when every region of @mem has a size that fits in an unsigned
   * long, is writable at its userspace address and - if @log_all is set - has
   * a writable log range at @log_base for its guest physical range.
   * Returns 0 otherwise, including when @mem is NULL. */
  static int vq_memory_access_ok(void __user *log_base, struct vhost_memory *mem,
  			       int log_all)
  {
  	struct vhost_memory_region *reg;
  	unsigned long uaddr;
  	int i;
  
  	if (mem == NULL)
  		return 0;
  
  	for (i = 0; i < mem->nregions; ++i) {
  		reg = mem->regions + i;
  		uaddr = reg->userspace_addr;
  
  		if (reg->memory_size > ULONG_MAX)
  			return 0;
  		if (!access_ok(VERIFY_WRITE, (void __user *)uaddr,
  			       reg->memory_size))
  			return 0;
  		if (log_all && !log_access_ok(log_base, reg->guest_phys_addr,
  					      reg->memory_size))
  			return 0;
  	}
  	return 1;
  }
  
  /* Can we switch to this memory table? */
  /* Caller should have device mutex but not vq mutex */
  static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem,
  			    int log_all)
  {
  	int i;
d47effe1b   Krishna Kumar   vhost: Cleanup vh...
504

3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
  	for (i = 0; i < d->nvqs; ++i) {
  		int ok;
  		mutex_lock(&d->vqs[i].mutex);
  		/* If ring is inactive, will check when it's enabled. */
  		if (d->vqs[i].private_data)
  			ok = vq_memory_access_ok(d->vqs[i].log_base, mem,
  						 log_all);
  		else
  			ok = 1;
  		mutex_unlock(&d->vqs[i].mutex);
  		if (!ok)
  			return 0;
  	}
  	return 1;
  }
8ea8cf89e   Michael S. Tsirkin   vhost: support ev...
520
  /*
   * Check user-space access to the three rings of a virtqueue with @num
   * entries: descriptor table and avail ring for reading, used ring for
   * writing.  With VIRTIO_RING_F_EVENT_IDX the avail and used rings each carry
   * 2 extra bytes.  Returns 1 when all three ranges pass, 0 otherwise.
   */
  static int vq_access_ok(struct vhost_dev *d, unsigned int num,
  			struct vring_desc __user *desc,
  			struct vring_avail __user *avail,
  			struct vring_used __user *used)
  {
  	size_t event_sz = vhost_has_feature(d, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
  
  	if (!access_ok(VERIFY_READ, desc, num * sizeof *desc))
  		return 0;
  	if (!access_ok(VERIFY_READ, avail,
  		       sizeof *avail + num * sizeof *avail->ring + event_sz))
  		return 0;
  	if (!access_ok(VERIFY_WRITE, used,
  		       sizeof *used + num * sizeof *used->ring + event_sz))
  		return 0;
  
  	return 1;
  }
  
  /* Can we log writes? */
  /* Caller should have device mutex but not vq mutex.
   * Runs memory_access_ok() on the device's current memory table with log
   * checking forced on. */
  int vhost_log_access_ok(struct vhost_dev *dev)
  {
  	struct vhost_memory *table =
  		rcu_dereference_protected(dev->memory,
  					  lockdep_is_held(&dev->mutex));
  
  	return memory_access_ok(dev, table, 1);
  }
  
  /* Verify access for write logging. */
  /* Caller should have vq mutex and device mutex */
8ea8cf89e   Michael S. Tsirkin   vhost: support ev...
546
547
  static int vq_log_access_ok(struct vhost_dev *d, struct vhost_virtqueue *vq,
  			    void __user *log_base)
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
548
  {
28457ee69   Arnd Bergmann   vhost: add __rcu ...
549
  	struct vhost_memory *mp;
8ea8cf89e   Michael S. Tsirkin   vhost: support ev...
550
  	size_t s = vhost_has_feature(d, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
28457ee69   Arnd Bergmann   vhost: add __rcu ...
551
552
553
554
  
  	mp = rcu_dereference_protected(vq->dev->memory,
  				       lockdep_is_held(&vq->mutex));
  	return vq_memory_access_ok(log_base, mp,
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
555
556
557
  			    vhost_has_feature(vq->dev, VHOST_F_LOG_ALL)) &&
  		(!vq->log_used || log_access_ok(log_base, vq->log_addr,
  					sizeof *vq->used +
8ea8cf89e   Michael S. Tsirkin   vhost: support ev...
558
  					vq->num * sizeof *vq->used->ring + s));
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
559
560
561
562
563
564
  }
  
  /* Can we start vq? */
  /* Caller should have vq mutex and device mutex */
  int vhost_vq_access_ok(struct vhost_virtqueue *vq)
  {
8ea8cf89e   Michael S. Tsirkin   vhost: support ev...
565
566
  	return vq_access_ok(vq->dev, vq->num, vq->desc, vq->avail, vq->used) &&
  		vq_log_access_ok(vq->dev, vq, vq->log_base);
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
567
568
569
570
571
572
  }
  
  /*
   * Replace the device's memory table with the one described by user pointer
   * @m.  The fixed header is copied first to learn nregions, then header and
   * regions are copied into a freshly allocated table, which is validated and
   * published with rcu_assign_pointer(); the old table is freed after
   * synchronize_rcu().
   *
   * Returns 0 on success, -EFAULT on a failed user copy or failed access
   * check, -EOPNOTSUPP when padding is non-zero, -E2BIG when nregions exceeds
   * VHOST_MEMORY_MAX_NREGIONS, -ENOMEM on allocation failure.
   */
  static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
  {
  	struct vhost_memory hdr, *newmem, *oldmem;
  	unsigned long hdr_size = offsetof(struct vhost_memory, regions);
  
  	if (copy_from_user(&hdr, m, hdr_size))
  		return -EFAULT;
  	if (hdr.padding)
  		return -EOPNOTSUPP;
  	if (hdr.nregions > VHOST_MEMORY_MAX_NREGIONS)
  		return -E2BIG;
  
  	newmem = kmalloc(hdr_size + hdr.nregions * sizeof *m->regions,
  			 GFP_KERNEL);
  	if (!newmem)
  		return -ENOMEM;
  
  	memcpy(newmem, &hdr, hdr_size);
  	if (copy_from_user(newmem->regions, m->regions,
  			   hdr.nregions * sizeof *m->regions))
  		goto err_free;
  
  	if (!memory_access_ok(d, newmem,
  			      vhost_has_feature(d, VHOST_F_LOG_ALL)))
  		goto err_free;
  
  	oldmem = rcu_dereference_protected(d->memory,
  					   lockdep_is_held(&d->mutex));
  	rcu_assign_pointer(d->memory, newmem);
  	/* Wait for readers of the old table before freeing it. */
  	synchronize_rcu();
  	kfree(oldmem);
  	return 0;
  
  err_free:
  	kfree(newmem);
  	return -EFAULT;
  }
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
  static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
  {
  	struct file *eventfp, *filep = NULL,
  		    *pollstart = NULL, *pollstop = NULL;
  	struct eventfd_ctx *ctx = NULL;
  	u32 __user *idxp = argp;
  	struct vhost_virtqueue *vq;
  	struct vhost_vring_state s;
  	struct vhost_vring_file f;
  	struct vhost_vring_addr a;
  	u32 idx;
  	long r;
  
  	r = get_user(idx, idxp);
  	if (r < 0)
  		return r;
0f3d9a174   Krishna Kumar   vhost: Fix host p...
618
  	if (idx >= d->nvqs)
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
619
620
621
622
623
624
625
626
627
628
629
630
631
632
  		return -ENOBUFS;
  
  	vq = d->vqs + idx;
  
  	mutex_lock(&vq->mutex);
  
  	switch (ioctl) {
  	case VHOST_SET_VRING_NUM:
  		/* Resizing ring with an active backend?
  		 * You don't want to do that. */
  		if (vq->private_data) {
  			r = -EBUSY;
  			break;
  		}
7ad9c9d27   Takuya Yoshikawa   vhost: fix to che...
633
634
  		if (copy_from_user(&s, argp, sizeof s)) {
  			r = -EFAULT;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
635
  			break;
7ad9c9d27   Takuya Yoshikawa   vhost: fix to che...
636
  		}
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
637
638
639
640
641
642
643
644
645
646
647
648
649
  		if (!s.num || s.num > 0xffff || (s.num & (s.num - 1))) {
  			r = -EINVAL;
  			break;
  		}
  		vq->num = s.num;
  		break;
  	case VHOST_SET_VRING_BASE:
  		/* Moving base with an active backend?
  		 * You don't want to do that. */
  		if (vq->private_data) {
  			r = -EBUSY;
  			break;
  		}
7ad9c9d27   Takuya Yoshikawa   vhost: fix to che...
650
651
  		if (copy_from_user(&s, argp, sizeof s)) {
  			r = -EFAULT;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
652
  			break;
7ad9c9d27   Takuya Yoshikawa   vhost: fix to che...
653
  		}
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
654
655
656
657
658
659
660
661
662
663
664
  		if (s.num > 0xffff) {
  			r = -EINVAL;
  			break;
  		}
  		vq->last_avail_idx = s.num;
  		/* Forget the cached index value. */
  		vq->avail_idx = vq->last_avail_idx;
  		break;
  	case VHOST_GET_VRING_BASE:
  		s.index = idx;
  		s.num = vq->last_avail_idx;
7ad9c9d27   Takuya Yoshikawa   vhost: fix to che...
665
666
  		if (copy_to_user(argp, &s, sizeof s))
  			r = -EFAULT;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
667
668
  		break;
  	case VHOST_SET_VRING_ADDR:
7ad9c9d27   Takuya Yoshikawa   vhost: fix to che...
669
670
  		if (copy_from_user(&a, argp, sizeof a)) {
  			r = -EFAULT;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
671
  			break;
7ad9c9d27   Takuya Yoshikawa   vhost: fix to che...
672
  		}
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
  		if (a.flags & ~(0x1 << VHOST_VRING_F_LOG)) {
  			r = -EOPNOTSUPP;
  			break;
  		}
  		/* For 32bit, verify that the top 32bits of the user
  		   data are set to zero. */
  		if ((u64)(unsigned long)a.desc_user_addr != a.desc_user_addr ||
  		    (u64)(unsigned long)a.used_user_addr != a.used_user_addr ||
  		    (u64)(unsigned long)a.avail_user_addr != a.avail_user_addr) {
  			r = -EFAULT;
  			break;
  		}
  		if ((a.avail_user_addr & (sizeof *vq->avail->ring - 1)) ||
  		    (a.used_user_addr & (sizeof *vq->used->ring - 1)) ||
  		    (a.log_guest_addr & (sizeof *vq->used->ring - 1))) {
  			r = -EINVAL;
  			break;
  		}
  
  		/* We only verify access here if backend is configured.
  		 * If it is not, we don't as size might not have been setup.
  		 * We will verify when backend is configured. */
  		if (vq->private_data) {
8ea8cf89e   Michael S. Tsirkin   vhost: support ev...
696
  			if (!vq_access_ok(d, vq->num,
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
  				(void __user *)(unsigned long)a.desc_user_addr,
  				(void __user *)(unsigned long)a.avail_user_addr,
  				(void __user *)(unsigned long)a.used_user_addr)) {
  				r = -EINVAL;
  				break;
  			}
  
  			/* Also validate log access for used ring if enabled. */
  			if ((a.flags & (0x1 << VHOST_VRING_F_LOG)) &&
  			    !log_access_ok(vq->log_base, a.log_guest_addr,
  					   sizeof *vq->used +
  					   vq->num * sizeof *vq->used->ring)) {
  				r = -EINVAL;
  				break;
  			}
  		}
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
713
714
715
716
717
718
719
  		vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG));
  		vq->desc = (void __user *)(unsigned long)a.desc_user_addr;
  		vq->avail = (void __user *)(unsigned long)a.avail_user_addr;
  		vq->log_addr = a.log_guest_addr;
  		vq->used = (void __user *)(unsigned long)a.used_user_addr;
  		break;
  	case VHOST_SET_VRING_KICK:
7ad9c9d27   Takuya Yoshikawa   vhost: fix to che...
720
721
  		if (copy_from_user(&f, argp, sizeof f)) {
  			r = -EFAULT;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
722
  			break;
7ad9c9d27   Takuya Yoshikawa   vhost: fix to che...
723
  		}
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
724
  		eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
535297a6a   Michael S. Tsirkin   vhost: fix error ...
725
726
727
728
  		if (IS_ERR(eventfp)) {
  			r = PTR_ERR(eventfp);
  			break;
  		}
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
729
730
731
732
733
734
735
  		if (eventfp != vq->kick) {
  			pollstop = filep = vq->kick;
  			pollstart = vq->kick = eventfp;
  		} else
  			filep = eventfp;
  		break;
  	case VHOST_SET_VRING_CALL:
7ad9c9d27   Takuya Yoshikawa   vhost: fix to che...
736
737
  		if (copy_from_user(&f, argp, sizeof f)) {
  			r = -EFAULT;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
738
  			break;
7ad9c9d27   Takuya Yoshikawa   vhost: fix to che...
739
  		}
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
740
  		eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
535297a6a   Michael S. Tsirkin   vhost: fix error ...
741
742
743
744
  		if (IS_ERR(eventfp)) {
  			r = PTR_ERR(eventfp);
  			break;
  		}
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
745
746
747
748
749
750
751
752
753
754
  		if (eventfp != vq->call) {
  			filep = vq->call;
  			ctx = vq->call_ctx;
  			vq->call = eventfp;
  			vq->call_ctx = eventfp ?
  				eventfd_ctx_fileget(eventfp) : NULL;
  		} else
  			filep = eventfp;
  		break;
  	case VHOST_SET_VRING_ERR:
7ad9c9d27   Takuya Yoshikawa   vhost: fix to che...
755
756
  		if (copy_from_user(&f, argp, sizeof f)) {
  			r = -EFAULT;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
757
  			break;
7ad9c9d27   Takuya Yoshikawa   vhost: fix to che...
758
  		}
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
759
  		eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
535297a6a   Michael S. Tsirkin   vhost: fix error ...
760
761
762
763
  		if (IS_ERR(eventfp)) {
  			r = PTR_ERR(eventfp);
  			break;
  		}
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
  		if (eventfp != vq->error) {
  			filep = vq->error;
  			vq->error = eventfp;
  			ctx = vq->error_ctx;
  			vq->error_ctx = eventfp ?
  				eventfd_ctx_fileget(eventfp) : NULL;
  		} else
  			filep = eventfp;
  		break;
  	default:
  		r = -ENOIOCTLCMD;
  	}
  
  	if (pollstop && vq->handle_kick)
  		vhost_poll_stop(&vq->poll);
  
  	if (ctx)
  		eventfd_ctx_put(ctx);
  	if (filep)
  		fput(filep);
  
  	if (pollstart && vq->handle_kick)
  		vhost_poll_start(&vq->poll, vq->kick);
  
  	mutex_unlock(&vq->mutex);
  
  	if (pollstop && vq->handle_kick)
  		vhost_poll_flush(&vq->poll);
  	return r;
  }
  
  /* Caller must have device mutex */
  long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg)
  {
  	void __user *argp = (void __user *)arg;
  	struct file *eventfp, *filep = NULL;
  	struct eventfd_ctx *ctx = NULL;
  	u64 p;
  	long r;
  	int i, fd;
  
  	/* If you are not the owner, you can become one */
  	if (ioctl == VHOST_SET_OWNER) {
  		r = vhost_dev_set_owner(d);
  		goto done;
  	}
  
  	/* You must be the owner to do anything else */
  	r = vhost_dev_check_owner(d);
  	if (r)
  		goto done;
  
  	switch (ioctl) {
  	case VHOST_SET_MEM_TABLE:
  		r = vhost_set_memory(d, argp);
  		break;
  	case VHOST_SET_LOG_BASE:
7ad9c9d27   Takuya Yoshikawa   vhost: fix to che...
821
822
  		if (copy_from_user(&p, argp, sizeof p)) {
  			r = -EFAULT;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
823
  			break;
7ad9c9d27   Takuya Yoshikawa   vhost: fix to che...
824
  		}
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
825
826
827
828
829
830
831
832
833
834
  		if ((u64)(unsigned long)p != p) {
  			r = -EFAULT;
  			break;
  		}
  		for (i = 0; i < d->nvqs; ++i) {
  			struct vhost_virtqueue *vq;
  			void __user *base = (void __user *)(unsigned long)p;
  			vq = d->vqs + i;
  			mutex_lock(&vq->mutex);
  			/* If ring is inactive, will check when it's enabled. */
8ea8cf89e   Michael S. Tsirkin   vhost: support ev...
835
  			if (vq->private_data && !vq_log_access_ok(d, vq, base))
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
  				r = -EFAULT;
  			else
  				vq->log_base = base;
  			mutex_unlock(&vq->mutex);
  		}
  		break;
  	case VHOST_SET_LOG_FD:
  		r = get_user(fd, (int __user *)argp);
  		if (r < 0)
  			break;
  		eventfp = fd == -1 ? NULL : eventfd_fget(fd);
  		if (IS_ERR(eventfp)) {
  			r = PTR_ERR(eventfp);
  			break;
  		}
  		if (eventfp != d->log_file) {
  			filep = d->log_file;
  			ctx = d->log_ctx;
  			d->log_ctx = eventfp ?
  				eventfd_ctx_fileget(eventfp) : NULL;
  		} else
  			filep = eventfp;
  		for (i = 0; i < d->nvqs; ++i) {
  			mutex_lock(&d->vqs[i].mutex);
  			d->vqs[i].log_ctx = d->log_ctx;
  			mutex_unlock(&d->vqs[i].mutex);
  		}
  		if (ctx)
  			eventfd_ctx_put(ctx);
  		if (filep)
  			fput(filep);
  		break;
  	default:
  		r = vhost_set_vring(d, ioctl, argp);
  		break;
  	}
  done:
  	return r;
  }
  
  /* Find the entry of @mem whose guest-physical range contains @addr.
   * Returns the first such region, or NULL when no region covers @addr.
   * @len is part of the signature but is not consulted by the lookup. */
  static const struct vhost_memory_region *find_region(struct vhost_memory *mem,
  						     __u64 addr, __u32 len)
  {
  	int idx;

  	/* linear search is not brilliant, but we really have on the order of 6
  	 * regions in practice */
  	for (idx = 0; idx < mem->nregions; ++idx) {
  		struct vhost_memory_region *cur = &mem->regions[idx];

  		/* Region starts above the address: not a match. */
  		if (cur->guest_phys_addr > addr)
  			continue;
  		/* Compare against the last byte of the region (inclusive). */
  		if (cur->guest_phys_addr + cur->memory_size - 1 >= addr)
  			return cur;
  	}
  	return NULL;
  }
  
  /* Set bit number @nr, counted from user address @addr, in user memory.
   *
   * The page holding @addr is pinned with get_user_pages_fast(), mapped,
   * modified with set_bit(), marked dirty and released again.
   * Returns 0 on success, or the negative value from get_user_pages_fast().
   *
   * TODO: This is really inefficient.  We need something like get_user()
   * (instruction directly accesses the data, with an exception table entry
   * returning -EFAULT). See Documentation/x86/exception-tables.txt.
   */
  static int set_bit_to_user(int nr, void __user *addr)
  {
  	unsigned long uaddr = (unsigned long)addr;
  	/* Bit index relative to the start of the page that holds uaddr. */
  	int bit = nr + (uaddr % PAGE_SIZE) * 8;
  	struct page *page;
  	void *kaddr;
  	int pinned;

  	pinned = get_user_pages_fast(uaddr, 1, 1, &page);
  	if (pinned < 0)
  		return pinned;
  	/* Exactly one page was requested; anything else is a bug. */
  	BUG_ON(pinned != 1);

  	kaddr = kmap_atomic(page, KM_USER0);
  	set_bit(bit, kaddr);
  	kunmap_atomic(kaddr, KM_USER0);

  	set_page_dirty_lock(page);
  	put_page(page);
  	return 0;
  }
  
  /* Mark the range [@write_address, @write_address + @write_length) in the
   * bitmap at @log_base, one bit per VHOST_PAGE_SIZE unit.
   *
   * Returns 0 for an empty range, -EFAULT if a bitmap byte address does not
   * fit in an unsigned long, or the result of set_bit_to_user(). */
  static int log_write(void __user *log_base,
  		     u64 write_address, u64 write_length)
  {
  	u64 page = write_address / VHOST_PAGE_SIZE;
  	u64 remaining = write_length;
  	int r;

  	if (!remaining)
  		return 0;

  	/* Count from the start of the first page so that a range crossing a
  	 * page boundary touches every page it overlaps. */
  	remaining += write_address % VHOST_PAGE_SIZE;

  	while (1) {
  		u64 base = (u64)(unsigned long)log_base;
  		u64 byte_addr = base + page / 8;
  		int bit = page % 8;

  		/* The bitmap byte must be addressable as a native pointer. */
  		if ((u64)(unsigned long)byte_addr != byte_addr)
  			return -EFAULT;

  		r = set_bit_to_user(bit,
  				    (void __user *)(unsigned long)byte_addr);
  		if (r < 0)
  			return r;

  		if (remaining <= VHOST_PAGE_SIZE)
  			break;
  		remaining -= VHOST_PAGE_SIZE;
  		page += 1;
  	}
  	return r;
  }
  
  /* Log up to @len bytes of writes described by the first @log_num entries
   * of @log into the virtqueue's dirty bitmap (vq->log_base).
   *
   * Each entry contributes at most its own length.  Once @len bytes have been
   * accounted for, vq->log_ctx (if set) is signalled and 0 is returned.
   * A failing log_write() result is returned as is.  Running out of entries
   * before @len reaches zero triggers BUG(). */
  int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
  		    unsigned int log_num, u64 len)
  {
  	unsigned int idx;
  	int r;

  	/* Make sure data written is seen before log. */
  	smp_wmb();

  	for (idx = 0; idx < log_num; ++idx) {
  		u64 chunk = min(log[idx].len, len);

  		r = log_write(vq->log_base, log[idx].addr, chunk);
  		if (r < 0)
  			return r;

  		len -= chunk;
  		if (len)
  			continue;

  		/* Everything has been logged: notify the listener, if any. */
  		if (vq->log_ctx)
  			eventfd_signal(vq->log_ctx, 1);
  		return 0;
  	}

  	/* Length written exceeds what we have stored. This is a bug. */
  	BUG();
  	return 0;
  }
2723feaa8   Jason Wang   vhost: set log wh...
966
967
968
  static int vhost_update_used_flags(struct vhost_virtqueue *vq)
  {
  	void __user *used;
b834226b0   Michael S. Tsirkin   vhost: optimize i...
969
  	if (__put_user(vq->used_flags, &vq->used->flags) < 0)
2723feaa8   Jason Wang   vhost: set log wh...
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
  		return -EFAULT;
  	if (unlikely(vq->log_used)) {
  		/* Make sure the flag is seen before log. */
  		smp_wmb();
  		/* Log used flag write. */
  		used = &vq->used->flags;
  		log_write(vq->log_base, vq->log_addr +
  			  (used - (void __user *)vq->used),
  			  sizeof vq->used->flags);
  		if (vq->log_ctx)
  			eventfd_signal(vq->log_ctx, 1);
  	}
  	return 0;
  }
  
  /* Publish vq->avail_idx in the avail-event slot (vhost_avail_event(vq)).
   *
   * NOTE: the @avail_event argument is not read by this function; the value
   * written is always vq->avail_idx.
   *
   * With used-ring logging enabled the write is also recorded through
   * log_write() (result not checked) and vq->log_ctx is signalled if set.
   * Returns -EFAULT if the slot could not be written, 0 otherwise. */
  static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event)
  {
  	void __user *event_ptr;

  	if (__put_user(vq->avail_idx, vhost_avail_event(vq)))
  		return -EFAULT;

  	if (likely(!vq->log_used))
  		return 0;

  	/* Make sure the event is seen before log. */
  	smp_wmb();

  	/* Log avail event write, addressed by its offset in the used ring. */
  	event_ptr = vhost_avail_event(vq);
  	log_write(vq->log_base,
  		  vq->log_addr + (event_ptr - (void __user *)vq->used),
  		  sizeof *vhost_avail_event(vq));

  	if (vq->log_ctx)
  		eventfd_signal(vq->log_ctx, 1);
  	return 0;
  }
  
  /* Initialise used-ring state for @vq.
   *
   * Does nothing (returns 0) when vq->private_data is not set.  Otherwise
   * the used flags are written out, the cached signalled_used value is
   * invalidated and last_used_idx is loaded from the used ring's idx field.
   * Returns 0 on success or the error from the failing step. */
  int vhost_init_used(struct vhost_virtqueue *vq)
  {
  	int r = 0;

  	if (vq->private_data) {
  		r = vhost_update_used_flags(vq);
  		if (!r) {
  			vq->signalled_used_valid = false;
  			r = get_user(vq->last_used_idx, &vq->used->idx);
  		}
  	}
  	return r;
  }
a8d3782f9   Christoph Hellwig   vhost: fix sparse...
1016
1017
  /* Translate the guest-physical range [@addr, @addr + @len) into iovec
   * entries pointing at user space, using the device's memory table.
   *
   * A range that spans several memory regions yields one entry per region.
   * Returns the number of entries stored in @iov, -ENOBUFS if more than
   * @iov_size entries would be needed, or -EFAULT if some part of the range
   * is not covered by any region. */
  static int translate_desc(struct vhost_dev *dev, u64 addr, u32 len,
  			  struct iovec iov[], int iov_size)
  {
  	const struct vhost_memory_region *reg;
  	struct vhost_memory *mem;
  	struct iovec *_iov;
  	u64 s = 0;	/* bytes of the range translated so far */
  	int ret = 0;

  	rcu_read_lock();

  	mem = rcu_dereference(dev->memory);
  	while ((u64)len > s) {
  		u64 size;
  		if (unlikely(ret >= iov_size)) {
  			ret = -ENOBUFS;
  			break;
  		}
  		reg = find_region(mem, addr, len);
  		if (unlikely(!reg)) {
  			ret = -EFAULT;
  			break;
  		}
  		_iov = iov + ret;
  		/* Bytes left in this region starting at addr. */
  		size = reg->memory_size - addr + reg->guest_phys_addr;
  		/* Only the part not yet translated may go into this entry;
  		 * using the full len here would make every chunk after the
  		 * first one too long when the range crosses regions. */
  		_iov->iov_len = min((u64)len - s, size);
  		_iov->iov_base = (void __user *)(unsigned long)
  			(reg->userspace_addr + addr - reg->guest_phys_addr);
  		s += size;
  		addr += size;
  		++ret;
  	}

  	rcu_read_unlock();
  	return ret;
  }
  
  /* Each buffer in the virtqueues is actually a chain of descriptors.  This
   * function returns the index stored in @desc's next field when the
   * descriptor has VRING_DESC_F_NEXT set, or -1U if the chain ends here.
   * The returned index is not range-checked here. */
  static unsigned next_desc(struct vring_desc *desc)
  {
  	unsigned int idx = -1U;

  	if (desc->flags & VRING_DESC_F_NEXT) {
  		idx = desc->next;
  		/* Make sure compiler knows to grab that: we don't want it
  		 * changing!  We will use the result as an index in an array,
  		 * so most architectures only need a compiler barrier here. */
  		read_barrier_depends();
  	}
  	return idx;
  }
7b3384fc3   Michael S. Tsirkin   vhost: add unlike...
1073
1074
1075
1076
1077
  static int get_indirect(struct vhost_dev *dev, struct vhost_virtqueue *vq,
  			struct iovec iov[], unsigned int iov_size,
  			unsigned int *out_num, unsigned int *in_num,
  			struct vhost_log *log, unsigned int *log_num,
  			struct vring_desc *indirect)
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1078
1079
1080
1081
1082
1083
  {
  	struct vring_desc desc;
  	unsigned int i = 0, count, found = 0;
  	int ret;
  
  	/* Sanity check */
7b3384fc3   Michael S. Tsirkin   vhost: add unlike...
1084
  	if (unlikely(indirect->len % sizeof desc)) {
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1085
1086
1087
1088
1089
1090
1091
1092
1093
  		vq_err(vq, "Invalid length in indirect descriptor: "
  		       "len 0x%llx not multiple of 0x%zx
  ",
  		       (unsigned long long)indirect->len,
  		       sizeof desc);
  		return -EINVAL;
  	}
  
  	ret = translate_desc(dev, indirect->addr, indirect->len, vq->indirect,
e0e9b4064   Jason Wang   vhost: max s/g to...
1094
  			     UIO_MAXIOV);
7b3384fc3   Michael S. Tsirkin   vhost: add unlike...
1095
  	if (unlikely(ret < 0)) {
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
  		vq_err(vq, "Translation failure %d in indirect.
  ", ret);
  		return ret;
  	}
  
  	/* We will use the result as an address to read from, so most
  	 * architectures only need a compiler barrier here. */
  	read_barrier_depends();
  
  	count = indirect->len / sizeof desc;
  	/* Buffers are chained via a 16 bit next field, so
  	 * we can have at most 2^16 of these. */
7b3384fc3   Michael S. Tsirkin   vhost: add unlike...
1108
  	if (unlikely(count > USHRT_MAX + 1)) {
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1109
1110
1111
1112
1113
1114
1115
1116
  		vq_err(vq, "Indirect buffer length too big: %d
  ",
  		       indirect->len);
  		return -E2BIG;
  	}
  
  	do {
  		unsigned iov_count = *in_num + *out_num;
7b3384fc3   Michael S. Tsirkin   vhost: add unlike...
1117
  		if (unlikely(++found > count)) {
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1118
1119
1120
1121
1122
1123
  			vq_err(vq, "Loop detected: last one at %u "
  			       "indirect size %u
  ",
  			       i, count);
  			return -EINVAL;
  		}
d47effe1b   Krishna Kumar   vhost: Cleanup vh...
1124
1125
  		if (unlikely(memcpy_fromiovec((unsigned char *)&desc,
  					      vq->indirect, sizeof desc))) {
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1126
1127
1128
1129
1130
  			vq_err(vq, "Failed indirect descriptor: idx %d, %zx
  ",
  			       i, (size_t)indirect->addr + i * sizeof desc);
  			return -EINVAL;
  		}
7b3384fc3   Michael S. Tsirkin   vhost: add unlike...
1131
  		if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) {
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1132
1133
1134
1135
1136
1137
1138
1139
  			vq_err(vq, "Nested indirect descriptor: idx %d, %zx
  ",
  			       i, (size_t)indirect->addr + i * sizeof desc);
  			return -EINVAL;
  		}
  
  		ret = translate_desc(dev, desc.addr, desc.len, iov + iov_count,
  				     iov_size - iov_count);
7b3384fc3   Michael S. Tsirkin   vhost: add unlike...
1140
  		if (unlikely(ret < 0)) {
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
  			vq_err(vq, "Translation failure %d indirect idx %d
  ",
  			       ret, i);
  			return ret;
  		}
  		/* If this is an input descriptor, increment that count. */
  		if (desc.flags & VRING_DESC_F_WRITE) {
  			*in_num += ret;
  			if (unlikely(log)) {
  				log[*log_num].addr = desc.addr;
  				log[*log_num].len = desc.len;
  				++*log_num;
  			}
  		} else {
  			/* If it's an output descriptor, they're all supposed
  			 * to come before any input descriptors. */
7b3384fc3   Michael S. Tsirkin   vhost: add unlike...
1157
  			if (unlikely(*in_num)) {
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
  				vq_err(vq, "Indirect descriptor "
  				       "has out after in: idx %d
  ", i);
  				return -EINVAL;
  			}
  			*out_num += ret;
  		}
  	} while ((i = next_desc(&desc)) != -1);
  	return 0;
  }
  
  /* This looks in the virtqueue and for the first available buffer, and converts
   * it to an iovec for convenient access.  Since descriptors consist of some
   * number of output then some number of input descriptors, it's actually two
   * iovecs, but we pack them into one and note how many of each there were.
   *
d5675bd20   Michael S. Tsirkin   vhost: break out ...
1174
1175
1176
1177
1178
1179
1180
   * This function returns the descriptor number found, or vq->num (which is
   * never a valid descriptor number) if none was found.  A negative code is
   * returned on error. */
  int vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
  		      struct iovec iov[], unsigned int iov_size,
  		      unsigned int *out_num, unsigned int *in_num,
  		      struct vhost_log *log, unsigned int *log_num)
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1181
1182
1183
1184
1185
1186
1187
1188
  {
  	struct vring_desc desc;
  	unsigned int i, head, found = 0;
  	u16 last_avail_idx;
  	int ret;
  
  	/* Check it isn't doing very strange things with descriptor numbers. */
  	last_avail_idx = vq->last_avail_idx;
8b7347aab   Michael S. Tsirkin   vhost: get/put_us...
1189
  	if (unlikely(__get_user(vq->avail_idx, &vq->avail->idx))) {
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1190
1191
1192
  		vq_err(vq, "Failed to access avail idx at %p
  ",
  		       &vq->avail->idx);
d5675bd20   Michael S. Tsirkin   vhost: break out ...
1193
  		return -EFAULT;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1194
  	}
7b3384fc3   Michael S. Tsirkin   vhost: add unlike...
1195
  	if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1196
1197
  		vq_err(vq, "Guest moved used index from %u to %u",
  		       last_avail_idx, vq->avail_idx);
d5675bd20   Michael S. Tsirkin   vhost: break out ...
1198
  		return -EFAULT;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1199
1200
1201
1202
1203
1204
1205
  	}
  
  	/* If there's nothing new since last we looked, return invalid. */
  	if (vq->avail_idx == last_avail_idx)
  		return vq->num;
  
  	/* Only get avail ring entries after they have been exposed by guest. */
5659338c8   Michael S. Tsirkin   vhost-net: switch...
1206
  	smp_rmb();
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1207
1208
1209
  
  	/* Grab the next descriptor number they're advertising, and increment
  	 * the index we've seen. */
8b7347aab   Michael S. Tsirkin   vhost: get/put_us...
1210
1211
  	if (unlikely(__get_user(head,
  				&vq->avail->ring[last_avail_idx % vq->num]))) {
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1212
1213
1214
1215
  		vq_err(vq, "Failed to read head: idx %d address %p
  ",
  		       last_avail_idx,
  		       &vq->avail->ring[last_avail_idx % vq->num]);
d5675bd20   Michael S. Tsirkin   vhost: break out ...
1216
  		return -EFAULT;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1217
1218
1219
  	}
  
  	/* If their number is silly, that's an error. */
7b3384fc3   Michael S. Tsirkin   vhost: add unlike...
1220
  	if (unlikely(head >= vq->num)) {
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1221
1222
  		vq_err(vq, "Guest says index %u > %u is available",
  		       head, vq->num);
d5675bd20   Michael S. Tsirkin   vhost: break out ...
1223
  		return -EINVAL;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
  	}
  
  	/* When we start there are none of either input nor output. */
  	*out_num = *in_num = 0;
  	if (unlikely(log))
  		*log_num = 0;
  
  	i = head;
  	do {
  		unsigned iov_count = *in_num + *out_num;
7b3384fc3   Michael S. Tsirkin   vhost: add unlike...
1234
  		if (unlikely(i >= vq->num)) {
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1235
1236
  			vq_err(vq, "Desc index is %u > %u, head = %u",
  			       i, vq->num, head);
d5675bd20   Michael S. Tsirkin   vhost: break out ...
1237
  			return -EINVAL;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1238
  		}
7b3384fc3   Michael S. Tsirkin   vhost: add unlike...
1239
  		if (unlikely(++found > vq->num)) {
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1240
1241
1242
1243
  			vq_err(vq, "Loop detected: last one at %u "
  			       "vq size %u head %u
  ",
  			       i, vq->num, head);
d5675bd20   Michael S. Tsirkin   vhost: break out ...
1244
  			return -EINVAL;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1245
  		}
fcc042a28   Michael S. Tsirkin   vhost: copy_from_...
1246
  		ret = __copy_from_user(&desc, vq->desc + i, sizeof desc);
7b3384fc3   Michael S. Tsirkin   vhost: add unlike...
1247
  		if (unlikely(ret)) {
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1248
1249
1250
  			vq_err(vq, "Failed to get descriptor: idx %d addr %p
  ",
  			       i, vq->desc + i);
d5675bd20   Michael S. Tsirkin   vhost: break out ...
1251
  			return -EFAULT;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1252
1253
1254
1255
1256
  		}
  		if (desc.flags & VRING_DESC_F_INDIRECT) {
  			ret = get_indirect(dev, vq, iov, iov_size,
  					   out_num, in_num,
  					   log, log_num, &desc);
7b3384fc3   Michael S. Tsirkin   vhost: add unlike...
1257
  			if (unlikely(ret < 0)) {
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1258
1259
1260
  				vq_err(vq, "Failure detected "
  				       "in indirect descriptor at idx %d
  ", i);
d5675bd20   Michael S. Tsirkin   vhost: break out ...
1261
  				return ret;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1262
1263
1264
1265
1266
1267
  			}
  			continue;
  		}
  
  		ret = translate_desc(dev, desc.addr, desc.len, iov + iov_count,
  				     iov_size - iov_count);
7b3384fc3   Michael S. Tsirkin   vhost: add unlike...
1268
  		if (unlikely(ret < 0)) {
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1269
1270
1271
  			vq_err(vq, "Translation failure %d descriptor idx %d
  ",
  			       ret, i);
d5675bd20   Michael S. Tsirkin   vhost: break out ...
1272
  			return ret;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
  		}
  		if (desc.flags & VRING_DESC_F_WRITE) {
  			/* If this is an input descriptor,
  			 * increment that count. */
  			*in_num += ret;
  			if (unlikely(log)) {
  				log[*log_num].addr = desc.addr;
  				log[*log_num].len = desc.len;
  				++*log_num;
  			}
  		} else {
  			/* If it's an output descriptor, they're all supposed
  			 * to come before any input descriptors. */
7b3384fc3   Michael S. Tsirkin   vhost: add unlike...
1286
  			if (unlikely(*in_num)) {
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1287
1288
1289
  				vq_err(vq, "Descriptor has out after in: "
  				       "idx %d
  ", i);
d5675bd20   Michael S. Tsirkin   vhost: break out ...
1290
  				return -EINVAL;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1291
1292
1293
1294
1295
1296
1297
  			}
  			*out_num += ret;
  		}
  	} while ((i = next_desc(&desc)) != -1);
  
  	/* On success, increment avail index. */
  	vq->last_avail_idx++;
8ea8cf89e   Michael S. Tsirkin   vhost: support ev...
1298
1299
1300
1301
  
  	/* Assume notifications from guest are disabled at this point,
  	 * if they aren't we would need to update avail_event index. */
  	BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1302
1303
1304
1305
  	return head;
  }
  
  /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
8dd014adf   David Stevens   vhost-net: mergea...
1306
  void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1307
  {
8dd014adf   David Stevens   vhost-net: mergea...
1308
  	vq->last_avail_idx -= n;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1309
1310
1311
1312
1313
1314
  }
  
  /* After we've used one of their buffers, we tell them about it.  We'll then
   * want to notify the guest, using eventfd. */
  int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
  {
a8d3782f9   Christoph Hellwig   vhost: fix sparse...
1315
  	struct vring_used_elem __user *used;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1316
1317
1318
1319
  
  	/* The virtqueue contains a ring of used buffers.  Get a pointer to the
  	 * next entry in that used ring. */
  	used = &vq->used->ring[vq->last_used_idx % vq->num];
8b7347aab   Michael S. Tsirkin   vhost: get/put_us...
1320
  	if (__put_user(head, &used->id)) {
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1321
1322
1323
  		vq_err(vq, "Failed to write used id");
  		return -EFAULT;
  	}
8b7347aab   Michael S. Tsirkin   vhost: get/put_us...
1324
  	if (__put_user(len, &used->len)) {
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1325
1326
1327
1328
  		vq_err(vq, "Failed to write used len");
  		return -EFAULT;
  	}
  	/* Make sure buffer is written before we update index. */
5659338c8   Michael S. Tsirkin   vhost-net: switch...
1329
  	smp_wmb();
8b7347aab   Michael S. Tsirkin   vhost: get/put_us...
1330
  	if (__put_user(vq->last_used_idx + 1, &vq->used->idx)) {
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1331
1332
1333
1334
1335
  		vq_err(vq, "Failed to increment used idx");
  		return -EFAULT;
  	}
  	if (unlikely(vq->log_used)) {
  		/* Make sure data is seen before log. */
5659338c8   Michael S. Tsirkin   vhost-net: switch...
1336
  		smp_wmb();
86e9424d7   Michael S. Tsirkin   vhost: logging th...
1337
1338
  		/* Log used ring entry write. */
  		log_write(vq->log_base,
a8d3782f9   Christoph Hellwig   vhost: fix sparse...
1339
1340
  			  vq->log_addr +
  			   ((void __user *)used - (void __user *)vq->used),
86e9424d7   Michael S. Tsirkin   vhost: logging th...
1341
1342
1343
1344
1345
  			  sizeof *used);
  		/* Log used index update. */
  		log_write(vq->log_base,
  			  vq->log_addr + offsetof(struct vring_used, idx),
  			  sizeof vq->used->idx);
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1346
1347
1348
1349
  		if (vq->log_ctx)
  			eventfd_signal(vq->log_ctx, 1);
  	}
  	vq->last_used_idx++;
8ea8cf89e   Michael S. Tsirkin   vhost: support ev...
1350
1351
1352
1353
1354
1355
  	/* If the driver never bothers to signal in a very long while,
  	 * used index might wrap around. If that happens, invalidate
  	 * signalled_used index we stored. TODO: make sure driver
  	 * signals at least once in 2^16 and remove this. */
  	if (unlikely(vq->last_used_idx == vq->signalled_used))
  		vq->signalled_used_valid = false;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1356
1357
  	return 0;
  }
8dd014adf   David Stevens   vhost-net: mergea...
1358
1359
1360
1361
1362
  static int __vhost_add_used_n(struct vhost_virtqueue *vq,
  			    struct vring_used_elem *heads,
  			    unsigned count)
  {
  	struct vring_used_elem __user *used;
8ea8cf89e   Michael S. Tsirkin   vhost: support ev...
1363
  	u16 old, new;
8dd014adf   David Stevens   vhost-net: mergea...
1364
1365
1366
1367
  	int start;
  
  	start = vq->last_used_idx % vq->num;
  	used = vq->used->ring + start;
dfe5ac5b1   Michael S. Tsirkin   vhost: copy_to_us...
1368
  	if (__copy_to_user(used, heads, count * sizeof *used)) {
8dd014adf   David Stevens   vhost-net: mergea...
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
  		vq_err(vq, "Failed to write used");
  		return -EFAULT;
  	}
  	if (unlikely(vq->log_used)) {
  		/* Make sure data is seen before log. */
  		smp_wmb();
  		/* Log used ring entry write. */
  		log_write(vq->log_base,
  			  vq->log_addr +
  			   ((void __user *)used - (void __user *)vq->used),
  			  count * sizeof *used);
  	}
8ea8cf89e   Michael S. Tsirkin   vhost: support ev...
1381
1382
1383
1384
1385
1386
1387
1388
  	old = vq->last_used_idx;
  	new = (vq->last_used_idx += count);
  	/* If the driver never bothers to signal in a very long while,
  	 * used index might wrap around. If that happens, invalidate
  	 * signalled_used index we stored. TODO: make sure driver
  	 * signals at least once in 2^16 and remove this. */
  	if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
  		vq->signalled_used_valid = false;
8dd014adf   David Stevens   vhost-net: mergea...
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
  	return 0;
  }
  
  /* Record @count consumed buffers from @heads in the used ring.
   *
   * If the batch would run past the end of the ring it is written in two
   * pieces: first the slots up to the end of the ring, then the remainder
   * from the ring's start.  Afterwards last_used_idx is published as the
   * ring's idx; with used-ring logging enabled that write is logged and
   * vq->log_ctx is signalled if set.
   *
   * Returns a negative value if the first piece failed, -EFAULT if idx could
   * not be written, otherwise the result of the final __vhost_add_used_n(). */
  int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
  		     unsigned count)
  {
  	int first, room, r;

  	first = vq->last_used_idx % vq->num;
  	/* Slots available before the ring wraps. */
  	room = vq->num - first;

  	if (room < count) {
  		r = __vhost_add_used_n(vq, heads, room);
  		if (r < 0)
  			return r;
  		heads += room;
  		count -= room;
  	}
  	r = __vhost_add_used_n(vq, heads, count);

  	/* Make sure buffer is written before we update index. */
  	smp_wmb();

  	if (put_user(vq->last_used_idx, &vq->used->idx)) {
  		vq_err(vq, "Failed to increment used idx");
  		return -EFAULT;
  	}

  	if (unlikely(vq->log_used)) {
  		/* Log used index update. */
  		log_write(vq->log_base,
  			  vq->log_addr + offsetof(struct vring_used, idx),
  			  sizeof vq->used->idx);
  		if (vq->log_ctx)
  			eventfd_signal(vq->log_ctx, 1);
  	}

  	return r;
  }
8ea8cf89e   Michael S. Tsirkin   vhost: support ev...
1426
  static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1427
  {
8ea8cf89e   Michael S. Tsirkin   vhost: support ev...
1428
1429
  	__u16 old, new, event;
  	bool v;
0d4993563   Michael S. Tsirkin   vhost: fix barrie...
1430
1431
1432
1433
  	/* Flush out used index updates. This is paired
  	 * with the barrier that the Guest executes when enabling
  	 * interrupts. */
  	smp_mb();
8ea8cf89e   Michael S. Tsirkin   vhost: support ev...
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
  	if (vhost_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
  	    unlikely(vq->avail_idx == vq->last_avail_idx))
  		return true;
  
  	if (!vhost_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
  		__u16 flags;
  		if (__get_user(flags, &vq->avail->flags)) {
  			vq_err(vq, "Failed to get flags");
  			return true;
  		}
  		return !(flags & VRING_AVAIL_F_NO_INTERRUPT);
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1445
  	}
8ea8cf89e   Michael S. Tsirkin   vhost: support ev...
1446
1447
1448
1449
  	old = vq->signalled_used;
  	v = vq->signalled_used_valid;
  	new = vq->signalled_used = vq->last_used_idx;
  	vq->signalled_used_valid = true;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1450

8ea8cf89e   Michael S. Tsirkin   vhost: support ev...
1451
1452
  	if (unlikely(!v))
  		return true;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1453

8ea8cf89e   Michael S. Tsirkin   vhost: support ev...
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
  	if (get_user(event, vhost_used_event(vq))) {
  		vq_err(vq, "Failed to get used event idx");
  		return true;
  	}
  	return vring_need_event(event, new, old);
  }
  
  /* This actually signals the guest, using eventfd. */
  void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
  {
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1464
  	/* Signal the Guest tell them we used something up. */
8ea8cf89e   Michael S. Tsirkin   vhost: support ev...
1465
  	if (vq->call_ctx && vhost_notify(dev, vq))
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
  		eventfd_signal(vq->call_ctx, 1);
  }
  
  /* And here's the combo meal deal.  Supersize me!
   *
   * Calls vhost_add_used() for the single buffer (@head, @len) and then
   * vhost_signal() for the same virtqueue.  Nothing is returned: any status
   * from vhost_add_used() is not examined here. */
  void vhost_add_used_and_signal(struct vhost_dev *dev,
  			       struct vhost_virtqueue *vq,
  			       unsigned int head, int len)
  {
  	vhost_add_used(vq, head, len);
  	vhost_signal(dev, vq);
  }
8dd014adf   David Stevens   vhost-net: mergea...
1477
1478
1479
1480
1481
1482
1483
1484
  /* Multi-buffer version of vhost_add_used_and_signal: passes the @count
   * entries in @heads to vhost_add_used_n() and then calls vhost_signal().
   * The return value of vhost_add_used_n() is discarded. */
  void vhost_add_used_and_signal_n(struct vhost_dev *dev,
  				 struct vhost_virtqueue *vq,
  				 struct vring_used_elem *heads, unsigned count)
  {
  	vhost_add_used_n(vq, heads, count);
  	vhost_signal(dev, vq);
  }
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1485
  /* OK, now we need to know about added descriptors. */
8ea8cf89e   Michael S. Tsirkin   vhost: support ev...
1486
  bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1487
1488
1489
  {
  	u16 avail_idx;
  	int r;
d47effe1b   Krishna Kumar   vhost: Cleanup vh...
1490

3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1491
1492
1493
  	if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
  		return false;
  	vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;
8ea8cf89e   Michael S. Tsirkin   vhost: support ev...
1494
  	if (!vhost_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
2723feaa8   Jason Wang   vhost: set log wh...
1495
  		r = vhost_update_used_flags(vq);
8ea8cf89e   Michael S. Tsirkin   vhost: support ev...
1496
1497
1498
1499
1500
1501
1502
  		if (r) {
  			vq_err(vq, "Failed to enable notification at %p: %d
  ",
  			       &vq->used->flags, r);
  			return false;
  		}
  	} else {
2723feaa8   Jason Wang   vhost: set log wh...
1503
  		r = vhost_update_avail_event(vq, vq->avail_idx);
8ea8cf89e   Michael S. Tsirkin   vhost: support ev...
1504
1505
1506
1507
1508
1509
1510
  		if (r) {
  			vq_err(vq, "Failed to update avail event index at %p: %d
  ",
  			       vhost_avail_event(vq), r);
  			return false;
  		}
  	}
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1511
1512
  	/* They could have slipped one in as we were doing that: make
  	 * sure it's written, then check again. */
5659338c8   Michael S. Tsirkin   vhost-net: switch...
1513
  	smp_mb();
8b7347aab   Michael S. Tsirkin   vhost: get/put_us...
1514
  	r = __get_user(avail_idx, &vq->avail->idx);
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1515
1516
1517
1518
1519
1520
  	if (r) {
  		vq_err(vq, "Failed to check avail idx at %p: %d
  ",
  		       &vq->avail->idx, r);
  		return false;
  	}
8dd014adf   David Stevens   vhost-net: mergea...
1521
  	return avail_idx != vq->avail_idx;
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1522
1523
1524
  }
  
  /* We don't need to be notified again. */
8ea8cf89e   Michael S. Tsirkin   vhost: support ev...
1525
  void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1526
1527
  {
  	int r;
d47effe1b   Krishna Kumar   vhost: Cleanup vh...
1528

3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1529
1530
1531
  	if (vq->used_flags & VRING_USED_F_NO_NOTIFY)
  		return;
  	vq->used_flags |= VRING_USED_F_NO_NOTIFY;
8ea8cf89e   Michael S. Tsirkin   vhost: support ev...
1532
  	if (!vhost_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
2723feaa8   Jason Wang   vhost: set log wh...
1533
  		r = vhost_update_used_flags(vq);
8ea8cf89e   Michael S. Tsirkin   vhost: support ev...
1534
1535
1536
1537
1538
  		if (r)
  			vq_err(vq, "Failed to enable notification at %p: %d
  ",
  			       &vq->used->flags, r);
  	}
3a4d5c94e   Michael S. Tsirkin   vhost_net: a kern...
1539
  }
bab632d69   Michael S. Tsirkin   vhost: vhost TX z...
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
  
  /* kref release callback for a vhost_ubuf_ref: wakes up anyone sleeping on
   * the reference's wait queue.  It does not free the structure. */
  static void vhost_zerocopy_done_signal(struct kref *kref)
  {
  	struct vhost_ubuf_ref *ref;
  
  	ref = container_of(kref, struct vhost_ubuf_ref, kref);
  	wake_up(&ref->wait);
  }
  
  /* Allocate and initialise a reference counter for zero-copy buffers that
   * belong to @vq.
   *
   * Returns NULL when @zcopy is false, ERR_PTR(-ENOMEM) when the allocation
   * fails, and otherwise a new vhost_ubuf_ref with its kref and wait queue
   * initialised and its vq pointer set to @vq.  The memory comes from
   * kmalloc(); vhost_ubuf_put_and_wait() is the function in this file that
   * kfree()s it.
   */
  struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq,
  					bool zcopy)
  {
  	struct vhost_ubuf_ref *ubufs;
  	/* No zero copy backend? Nothing to count. */
  	if (!zcopy)
  		return NULL;
  	ubufs = kmalloc(sizeof *ubufs, GFP_KERNEL);
  	if (!ubufs)
  		return ERR_PTR(-ENOMEM);
  	kref_init(&ubufs->kref);
  	init_waitqueue_head(&ubufs->wait);
  	ubufs->vq = vq;
  	return ubufs;
  }
  
  /* Drop one reference on @ubufs.  vhost_zerocopy_done_signal() is passed as
   * the kref release callback, which wakes waiters on ubufs->wait rather than
   * freeing anything. */
  void vhost_ubuf_put(struct vhost_ubuf_ref *ubufs)
  {
  	kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
  }
  
  /* Drop one reference on @ubufs, sleep until its reference count reads zero,
   * then free it.  @ubufs must not be used by the caller afterwards. */
  void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *ubufs)
  {
  	kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
  	/* Woken by vhost_zerocopy_done_signal() via ubufs->wait. */
  	wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount));
  	kfree(ubufs);
  }
  
  /* Completion callback for a zero-copy buffer.  @arg is a struct ubuf_info
   * whose ->arg field points at the owning vhost_ubuf_ref.  Marks the
   * corresponding head entry as done and drops one reference. */
  void vhost_zerocopy_callback(void *arg)
  {
  	struct ubuf_info *info = arg;
  	struct vhost_ubuf_ref *ref = info->arg;
  
  	/* Setting len to VHOST_DMA_DONE_LEN marks this descriptor's buffers
  	 * as having finished DMA. */
  	ref->vq->heads[info->desc].len = VHOST_DMA_DONE_LEN;
  	kref_put(&ref->kref, vhost_zerocopy_done_signal);
  }