Commit 8dd014adfea6f173c1ef6378f7e5e7924866c923

Authored by David Stevens
Committed by Michael S. Tsirkin
1 parent 9e3d195720

vhost-net: mergeable buffers support

This adds support for mergeable buffers in vhost-net. The feature is
needed for older guests without indirect buffer support, as well
as for zero copy with some devices.

Includes changes by Michael S. Tsirkin to make the
patch as low risk as possible (i.e., close to no changes
when the feature is disabled).

Signed-off-by: David Stevens <dlstevens@us.ibm.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
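
For orientation: the header this feature negotiates is the mergeable-rxbuf
variant of the vnet header, defined in include/linux/virtio_net.h (shown
here for reference, not part of this patch):

	struct virtio_net_hdr_mrg_rxbuf {
		struct virtio_net_hdr hdr;	/* the usual vnet header */
		__u16 num_buffers;		/* number of merged rx buffers */
	};

When VIRTIO_NET_F_MRG_RXBUF is negotiated, the host fills in num_buffers
so the guest knows how many descriptor chains one received packet spans.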

Showing 3 changed files with 315 additions and 18 deletions

drivers/vhost/net.c

... ... @@ -74,6 +74,22 @@
74 74 }
75 75 return seg;
76 76 }
  77 +/* Copy iovec entries for len bytes from one iovec to another. */
  78 +static void copy_iovec_hdr(const struct iovec *from, struct iovec *to,
  79 + size_t len, int iovcount)
  80 +{
  81 + int seg = 0;
  82 + size_t size;
  83 + while (len && seg < iovcount) {
  84 + size = min(from->iov_len, len);
  85 + to->iov_base = from->iov_base;
  86 + to->iov_len = size;
  87 + len -= size;
  88 + ++from;
  89 + ++to;
  90 + ++seg;
  91 + }
  92 +}
77 93  
78 94 /* Caller must have TX VQ lock */
79 95 static void tx_poll_stop(struct vhost_net *net)
... ... @@ -129,7 +145,7 @@
129 145  
130 146 if (wmem < sock->sk->sk_sndbuf / 2)
131 147 tx_poll_stop(net);
132   - hdr_size = vq->hdr_size;
  148 + hdr_size = vq->vhost_hlen;
133 149  
134 150 for (;;) {
135 151 head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
... ... @@ -172,7 +188,7 @@
172 188 /* TODO: Check specific error and bomb out unless ENOBUFS? */
173 189 err = sock->ops->sendmsg(NULL, sock, &msg, len);
174 190 if (unlikely(err < 0)) {
175   - vhost_discard_vq_desc(vq);
  191 + vhost_discard_vq_desc(vq, 1);
176 192 tx_poll_start(net, sock);
177 193 break;
178 194 }
179 195  
... ... @@ -191,9 +207,82 @@
191 207 unuse_mm(net->dev.mm);
192 208 }
193 209  
  210 +static int peek_head_len(struct sock *sk)
  211 +{
  212 + struct sk_buff *head;
  213 + int len = 0;
  214 +
  215 + lock_sock(sk);
  216 + head = skb_peek(&sk->sk_receive_queue);
  217 + if (head)
  218 + len = head->len;
  219 + release_sock(sk);
  220 + return len;
  221 +}
  222 +
  223 +/* This is a multi-buffer version of vhost_get_vq_desc, that works if
  224 + * vq has read descriptors only.
  225 + * @vq - the relevant virtqueue
  226 + * @datalen - data length we'll be reading
  227 + * @iovcount - returned count of io vectors we fill
  228 + * @log - vhost log
  229 + * @log_num - log offset
  230 + * returns number of buffer heads allocated, negative on error
  231 + */
  232 +static int get_rx_bufs(struct vhost_virtqueue *vq,
  233 + struct vring_used_elem *heads,
  234 + int datalen,
  235 + unsigned *iovcount,
  236 + struct vhost_log *log,
  237 + unsigned *log_num)
  238 +{
  239 + unsigned int out, in;
  240 + int seg = 0;
  241 + int headcount = 0;
  242 + unsigned d;
  243 + int r, nlogs = 0;
  244 +
  245 + while (datalen > 0) {
  246 + if (unlikely(headcount >= VHOST_NET_MAX_SG)) {
  247 + r = -ENOBUFS;
  248 + goto err;
  249 + }
  250 + d = vhost_get_vq_desc(vq->dev, vq, vq->iov + seg,
  251 + ARRAY_SIZE(vq->iov) - seg, &out,
  252 + &in, log, log_num);
  253 + if (d == vq->num) {
  254 + r = 0;
  255 + goto err;
  256 + }
  257 + if (unlikely(out || in <= 0)) {
  258 + vq_err(vq, "unexpected descriptor format for RX: "
  259 + "out %d, in %d\n", out, in);
  260 + r = -EINVAL;
  261 + goto err;
  262 + }
  263 + if (unlikely(log)) {
  264 + nlogs += *log_num;
  265 + log += *log_num;
  266 + }
  267 + heads[headcount].id = d;
  268 + heads[headcount].len = iov_length(vq->iov + seg, in);
  269 + datalen -= heads[headcount].len;
  270 + ++headcount;
  271 + seg += in;
  272 + }
  273 + heads[headcount - 1].len += datalen;
  274 + *iovcount = seg;
  275 + if (unlikely(log))
  276 + *log_num = nlogs;
  277 + return headcount;
  278 +err:
  279 + vhost_discard_vq_desc(vq, headcount);
  280 + return r;
  281 +}
  282 +
194 283 /* Expects to be always run from workqueue - which acts as
 195 284 * read-side critical section for our kind of RCU. */
196   -static void handle_rx(struct vhost_net *net)
  285 +static void handle_rx_big(struct vhost_net *net)
197 286 {
198 287 struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
199 288 unsigned out, in, log, s;
... ... @@ -223,7 +312,7 @@
223 312 use_mm(net->dev.mm);
224 313 mutex_lock(&vq->mutex);
225 314 vhost_disable_notify(vq);
226   - hdr_size = vq->hdr_size;
  315 + hdr_size = vq->vhost_hlen;
227 316  
228 317 vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
229 318 vq->log : NULL;
230 319  
... ... @@ -270,14 +359,14 @@
270 359 len, MSG_DONTWAIT | MSG_TRUNC);
271 360 /* TODO: Check specific error and bomb out unless EAGAIN? */
272 361 if (err < 0) {
273   - vhost_discard_vq_desc(vq);
  362 + vhost_discard_vq_desc(vq, 1);
274 363 break;
275 364 }
276 365 /* TODO: Should check and handle checksum. */
277 366 if (err > len) {
278 367 pr_debug("Discarded truncated rx packet: "
279 368 " len %d > %zd\n", err, len);
280   - vhost_discard_vq_desc(vq);
  369 + vhost_discard_vq_desc(vq, 1);
281 370 continue;
282 371 }
283 372 len = err;
... ... @@ -302,6 +391,123 @@
302 391 unuse_mm(net->dev.mm);
303 392 }
304 393  
  394 +/* Expects to be always run from workqueue - which acts as
  395 + * read-side critical section for our kind of RCU. */
  396 +static void handle_rx_mergeable(struct vhost_net *net)
  397 +{
  398 + struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
  399 + unsigned uninitialized_var(in), log;
  400 + struct vhost_log *vq_log;
  401 + struct msghdr msg = {
  402 + .msg_name = NULL,
  403 + .msg_namelen = 0,
  404 + .msg_control = NULL, /* FIXME: get and handle RX aux data. */
  405 + .msg_controllen = 0,
  406 + .msg_iov = vq->iov,
  407 + .msg_flags = MSG_DONTWAIT,
  408 + };
  409 +
  410 + struct virtio_net_hdr_mrg_rxbuf hdr = {
  411 + .hdr.flags = 0,
  412 + .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
  413 + };
  414 +
  415 + size_t total_len = 0;
  416 + int err, headcount;
  417 + size_t vhost_hlen, sock_hlen;
  418 + size_t vhost_len, sock_len;
  419 + struct socket *sock = rcu_dereference(vq->private_data);
  420 + if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
  421 + return;
  422 +
  423 + use_mm(net->dev.mm);
  424 + mutex_lock(&vq->mutex);
  425 + vhost_disable_notify(vq);
  426 + vhost_hlen = vq->vhost_hlen;
  427 + sock_hlen = vq->sock_hlen;
  428 +
  429 + vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
  430 + vq->log : NULL;
  431 +
  432 + while ((sock_len = peek_head_len(sock->sk))) {
  433 + sock_len += sock_hlen;
  434 + vhost_len = sock_len + vhost_hlen;
  435 + headcount = get_rx_bufs(vq, vq->heads, vhost_len,
  436 + &in, vq_log, &log);
  437 + /* On error, stop handling until the next kick. */
  438 + if (unlikely(headcount < 0))
  439 + break;
  440 + /* OK, now we need to know about added descriptors. */
  441 + if (!headcount) {
  442 + if (unlikely(vhost_enable_notify(vq))) {
  443 + /* They have slipped one in as we were
  444 + * doing that: check again. */
  445 + vhost_disable_notify(vq);
  446 + continue;
  447 + }
  448 + /* Nothing new? Wait for eventfd to tell us
  449 + * they refilled. */
  450 + break;
  451 + }
  452 + /* We don't need to be notified again. */
  453 + if (unlikely(vhost_hlen))
  454 + /* Skip header. TODO: support TSO. */
  455 + move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
  456 + else
  457 + /* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF:
  458 + * needed because recvmsg can modify msg_iov. */
  459 + copy_iovec_hdr(vq->iov, vq->hdr, sock_hlen, in);
  460 + msg.msg_iovlen = in;
  461 + err = sock->ops->recvmsg(NULL, sock, &msg,
  462 + sock_len, MSG_DONTWAIT | MSG_TRUNC);
  463 + /* Userspace might have consumed the packet meanwhile:
  464 + * it's not supposed to do this usually, but might be hard
  465 + * to prevent. Discard data we got (if any) and keep going. */
  466 + if (unlikely(err != sock_len)) {
  467 + pr_debug("Discarded rx packet: "
  468 + " len %d, expected %zd\n", err, sock_len);
  469 + vhost_discard_vq_desc(vq, headcount);
  470 + continue;
  471 + }
  472 + if (unlikely(vhost_hlen) &&
  473 + memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
  474 + vhost_hlen)) {
  475 + vq_err(vq, "Unable to write vnet_hdr at addr %p\n",
  476 + vq->iov->iov_base);
  477 + break;
  478 + }
  479 + /* TODO: Should check and handle checksum. */
  480 + if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
  481 + memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
  482 + offsetof(typeof(hdr), num_buffers),
  483 + sizeof hdr.num_buffers)) {
  484 + vq_err(vq, "Failed num_buffers write");
  485 + vhost_discard_vq_desc(vq, headcount);
  486 + break;
  487 + }
  488 + vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
  489 + headcount);
  490 + if (unlikely(vq_log))
  491 + vhost_log_write(vq, vq_log, log, vhost_len);
  492 + total_len += vhost_len;
  493 + if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
  494 + vhost_poll_queue(&vq->poll);
  495 + break;
  496 + }
  497 + }
  498 +
  499 + mutex_unlock(&vq->mutex);
  500 + unuse_mm(net->dev.mm);
  501 +}
  502 +
  503 +static void handle_rx(struct vhost_net *net)
  504 +{
  505 + if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF))
  506 + handle_rx_mergeable(net);
  507 + else
  508 + handle_rx_big(net);
  509 +}
  510 +
305 511 static void handle_tx_kick(struct vhost_work *work)
306 512 {
307 513 struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
308 514  
... ... @@ -577,9 +783,21 @@
577 783  
578 784 static int vhost_net_set_features(struct vhost_net *n, u64 features)
579 785 {
580   - size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
581   - sizeof(struct virtio_net_hdr) : 0;
  786 + size_t vhost_hlen, sock_hlen, hdr_len;
582 787 int i;
  788 +
  789 + hdr_len = (features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ?
  790 + sizeof(struct virtio_net_hdr_mrg_rxbuf) :
  791 + sizeof(struct virtio_net_hdr);
  792 + if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
  793 + /* vhost provides vnet_hdr */
  794 + vhost_hlen = hdr_len;
  795 + sock_hlen = 0;
  796 + } else {
  797 + /* socket provides vnet_hdr */
  798 + vhost_hlen = 0;
  799 + sock_hlen = hdr_len;
  800 + }
583 801 mutex_lock(&n->dev.mutex);
584 802 if ((features & (1 << VHOST_F_LOG_ALL)) &&
585 803 !vhost_log_access_ok(&n->dev)) {
... ... @@ -590,7 +808,8 @@
590 808 smp_wmb();
591 809 for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
592 810 mutex_lock(&n->vqs[i].mutex);
593   - n->vqs[i].hdr_size = hdr_size;
  811 + n->vqs[i].vhost_hlen = vhost_hlen;
  812 + n->vqs[i].sock_hlen = sock_hlen;
594 813 mutex_unlock(&n->vqs[i].mutex);
595 814 }
596 815 vhost_net_flush(n);
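
Note on the length accounting in get_rx_bufs() above: the loop gathers
whole descriptor chains until datalen is covered, so it can overshoot;
on exit datalen is zero or negative, and the final
heads[headcount - 1].len += datalen trims the last head so the used
lengths sum to exactly the packet size. A stand-alone model of just that
arithmetic, with hypothetical buffer sizes (editor's sketch, not driver
code):

	#include <stdio.h>

	int main(void)
	{
		int datalen = 1500;	/* bytes we need to place */
		int buf_len = 1024;	/* hypothetical size of each guest buffer */
		int lens[8], headcount = 0;

		while (datalen > 0) {
			lens[headcount] = buf_len;	/* iov_length() of this chain */
			datalen -= buf_len;
			++headcount;
		}
		lens[headcount - 1] += datalen;	/* datalen <= 0 here: trim */

		for (int i = 0; i < headcount; ++i)
			printf("head %d: len %d\n", i, lens[i]);	/* 1024, then 476 */
		return 0;
	}
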
drivers/vhost/vhost.c
... ... @@ -149,7 +149,8 @@
149 149 vq->used_flags = 0;
150 150 vq->log_used = false;
151 151 vq->log_addr = -1ull;
152   - vq->hdr_size = 0;
  152 + vq->vhost_hlen = 0;
  153 + vq->sock_hlen = 0;
153 154 vq->private_data = NULL;
154 155 vq->log_base = NULL;
155 156 vq->error_ctx = NULL;
156 157  
... ... @@ -1101,9 +1102,9 @@
1101 1102 }
1102 1103  
1103 1104 /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
1104   -void vhost_discard_vq_desc(struct vhost_virtqueue *vq)
  1105 +void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
1105 1106 {
1106   - vq->last_avail_idx--;
  1107 + vq->last_avail_idx -= n;
1107 1108 }
1108 1109  
1109 1110 /* After we've used one of their buffers, we tell them about it. We'll then
... ... @@ -1148,6 +1149,67 @@
1148 1149 return 0;
1149 1150 }
1150 1151  
  1152 +static int __vhost_add_used_n(struct vhost_virtqueue *vq,
  1153 + struct vring_used_elem *heads,
  1154 + unsigned count)
  1155 +{
  1156 + struct vring_used_elem __user *used;
  1157 + int start;
  1158 +
  1159 + start = vq->last_used_idx % vq->num;
  1160 + used = vq->used->ring + start;
  1161 + if (copy_to_user(used, heads, count * sizeof *used)) {
  1162 + vq_err(vq, "Failed to write used");
  1163 + return -EFAULT;
  1164 + }
  1165 + if (unlikely(vq->log_used)) {
  1166 + /* Make sure data is seen before log. */
  1167 + smp_wmb();
  1168 + /* Log used ring entry write. */
  1169 + log_write(vq->log_base,
  1170 + vq->log_addr +
  1171 + ((void __user *)used - (void __user *)vq->used),
  1172 + count * sizeof *used);
  1173 + }
  1174 + vq->last_used_idx += count;
  1175 + return 0;
  1176 +}
  1177 +
  1178 +/* After we've used one of their buffers, we tell them about it. We'll then
  1179 + * want to notify the guest, using eventfd. */
  1180 +int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
  1181 + unsigned count)
  1182 +{
  1183 + int start, n, r;
  1184 +
  1185 + start = vq->last_used_idx % vq->num;
  1186 + n = vq->num - start;
  1187 + if (n < count) {
  1188 + r = __vhost_add_used_n(vq, heads, n);
  1189 + if (r < 0)
  1190 + return r;
  1191 + heads += n;
  1192 + count -= n;
  1193 + }
  1194 + r = __vhost_add_used_n(vq, heads, count);
  1195 +
  1196 + /* Make sure buffer is written before we update index. */
  1197 + smp_wmb();
  1198 + if (put_user(vq->last_used_idx, &vq->used->idx)) {
  1199 + vq_err(vq, "Failed to increment used idx");
  1200 + return -EFAULT;
  1201 + }
  1202 + if (unlikely(vq->log_used)) {
  1203 + /* Log used index update. */
  1204 + log_write(vq->log_base,
  1205 + vq->log_addr + offsetof(struct vring_used, idx),
  1206 + sizeof vq->used->idx);
  1207 + if (vq->log_ctx)
  1208 + eventfd_signal(vq->log_ctx, 1);
  1209 + }
  1210 + return r;
  1211 +}
  1212 +
1151 1213 /* This actually signals the guest, using eventfd. */
1152 1214 void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
1153 1215 {
... ... @@ -1182,6 +1244,15 @@
1182 1244 vhost_signal(dev, vq);
1183 1245 }
1184 1246  
  1247 +/* multi-buffer version of vhost_add_used_and_signal */
  1248 +void vhost_add_used_and_signal_n(struct vhost_dev *dev,
  1249 + struct vhost_virtqueue *vq,
  1250 + struct vring_used_elem *heads, unsigned count)
  1251 +{
  1252 + vhost_add_used_n(vq, heads, count);
  1253 + vhost_signal(dev, vq);
  1254 +}
  1255 +
1185 1256 /* OK, now we need to know about added descriptors. */
1186 1257 bool vhost_enable_notify(struct vhost_virtqueue *vq)
1187 1258 {
... ... @@ -1206,7 +1277,7 @@
1206 1277 return false;
1207 1278 }
1208 1279  
1209   - return avail_idx != vq->last_avail_idx;
  1280 + return avail_idx != vq->avail_idx;
1210 1281 }
1211 1282  
1212 1283 /* We don't need to be notified again. */
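
Note on vhost_add_used_n() above: when start + count runs past the end
of the used ring, the update is split into a tail copy followed by a
wrapped copy from slot 0, which is why __vhost_add_used_n() is called
twice. A minimal stand-alone model of that split (editor's sketch using
a plain int ring; names here are assumed, not driver code):

	#include <stdio.h>
	#include <string.h>

	#define RING_NUM 8

	static int ring[RING_NUM];

	static void add_used_n(unsigned last_used_idx, const int *heads,
			       unsigned count)
	{
		unsigned start = last_used_idx % RING_NUM;
		unsigned n = RING_NUM - start;

		if (n < count) {	/* ring wraps: copy the tail first */
			memcpy(ring + start, heads, n * sizeof *ring);
			heads += n;
			count -= n;
			start = 0;
		}
		memcpy(ring + start, heads, count * sizeof *ring);
	}

	int main(void)
	{
		int heads[3] = { 10, 11, 12 };

		add_used_n(6, heads, 3);	/* fills slots 6, 7, then wraps to 0 */
		printf("%d %d %d\n", ring[6], ring[7], ring[0]);	/* 10 11 12 */
		return 0;
	}
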
drivers/vhost/vhost.h
... ... @@ -96,7 +96,9 @@
96 96 struct iovec indirect[VHOST_NET_MAX_SG];
97 97 struct iovec iov[VHOST_NET_MAX_SG];
98 98 struct iovec hdr[VHOST_NET_MAX_SG];
99   - size_t hdr_size;
  99 + size_t vhost_hlen;
  100 + size_t sock_hlen;
  101 + struct vring_used_elem heads[VHOST_NET_MAX_SG];
100 102 /* We use a kind of RCU to access private pointer.
101 103 * All readers access it from worker, which makes it possible to
102 104 * flush the vhost_work instead of synchronize_rcu. Therefore readers do
103 105  
104 106  
... ... @@ -139,12 +141,16 @@
139 141 struct iovec iov[], unsigned int iov_count,
140 142 unsigned int *out_num, unsigned int *in_num,
141 143 struct vhost_log *log, unsigned int *log_num);
142   -void vhost_discard_vq_desc(struct vhost_virtqueue *);
  144 +void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
143 145  
144 146 int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
145   -void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
  147 +int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
  148 + unsigned count);
146 149 void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
147   - unsigned int head, int len);
  150 + unsigned int id, int len);
  151 +void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
  152 + struct vring_used_elem *heads, unsigned count);
  153 +void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
148 154 void vhost_disable_notify(struct vhost_virtqueue *);
149 155 bool vhost_enable_notify(struct vhost_virtqueue *);
150 156  
... ... @@ -161,7 +167,8 @@
161 167 VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) |
162 168 (1 << VIRTIO_RING_F_INDIRECT_DESC) |
163 169 (1 << VHOST_F_LOG_ALL) |
164   - (1 << VHOST_NET_F_VIRTIO_NET_HDR),
  170 + (1 << VHOST_NET_F_VIRTIO_NET_HDR) |
  171 + (1 << VIRTIO_NET_F_MRG_RXBUF),
165 172 };
166 173  
167 174 static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
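
With VIRTIO_NET_F_MRG_RXBUF added to VHOST_FEATURES, a userspace backend
can negotiate the bit through the existing vhost ioctls. A hedged sketch
of that handshake (assumes an already-open /dev/vhost-net descriptor;
error handling trimmed):

	#include <sys/ioctl.h>
	#include <linux/vhost.h>
	#include <linux/virtio_net.h>

	/* Returns 0 if mergeable rx buffers were negotiated, -1 otherwise. */
	static int enable_mrg_rxbuf(int vhost_fd)
	{
		__u64 features;

		if (ioctl(vhost_fd, VHOST_GET_FEATURES, &features) < 0)
			return -1;
		if (!(features & (1ULL << VIRTIO_NET_F_MRG_RXBUF)))
			return -1;	/* kernel predates this patch */
		features |= 1ULL << VIRTIO_NET_F_MRG_RXBUF;
		return ioctl(vhost_fd, VHOST_SET_FEATURES, &features);
	}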