[dpdk-dev,v2,1/7] vhost: refactor rte_vhost_dequeue_burst
Commit Message
The current rte_vhost_dequeue_burst() implementation is a bit messy
and its logic is twisted. You can also see repeated code here and there:
it invokes rte_pktmbuf_alloc() three times at three different places!
However, rte_vhost_dequeue_burst() actually does a simple job: copy
the packet data from the vring desc to an mbuf. What's tricky here is:
- a desc buff could be chained (by the desc->next field), so that you
  need to fetch the next one when the current one is wholly drained;
- one mbuf may not be big enough to hold all the desc buff data, hence
  you need to chain the mbufs as well, by the mbuf->next field.
Even so, the logic can be simple. Here is the pseudo code:
    while (this_desc_is_not_drained_totally || has_next_desc) {
        if (this_desc_has_drained_totally) {
            this_desc = next_desc();
        }

        if (mbuf_has_no_room) {
            mbuf = allocate_a_new_mbuf();
        }

        COPY(mbuf, desc);
    }
And this is how I refactored rte_vhost_dequeue_burst.
Note that the old code does special handling to skip the virtio net
header. However, that can simply be done by adjusting the desc_avail
and desc_offset vars:
desc_avail = desc->len - vq->vhost_hlen;
desc_offset = vq->vhost_hlen;
This refactor makes the code much more readable (IMO), and it also
reduces the binary code size (by nearly 2K).
Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
---
v2: - fix potential NULL dereference bug of var "prev" and "head"
---
lib/librte_vhost/vhost_rxtx.c | 297 +++++++++++++++++-------------------------
1 file changed, 116 insertions(+), 181 deletions(-)
Comments
On 2/18/2016 9:48 PM, Yuanhan Liu wrote:
> The current rte_vhost_dequeue_burst() implementation is a bit messy
> and logic twisted. And you could see repeat code here and there: it
> invokes rte_pktmbuf_alloc() three times at three different places!
>
> However, rte_vhost_dequeue_burst() acutally does a simple job: copy
> the packet data from vring desc to mbuf. What's tricky here is:
>
> - desc buff could be chained (by desc->next field), so that you need
> fetch next one if current is wholly drained.
>
> - One mbuf could not be big enough to hold all desc buff, hence you
> need to chain the mbuf as well, by the mbuf->next field.
>
> Even though, the logic could be simple. Here is the pseudo code.
>
> while (this_desc_is_not_drained_totally || has_next_desc) {
> if (this_desc_has_drained_totally) {
> this_desc = next_desc();
> }
>
> if (mbuf_has_no_room) {
> mbuf = allocate_a_new_mbuf();
> }
>
> COPY(mbuf, desc);
> }
>
> And this is how I refactored rte_vhost_dequeue_burst.
>
> Note that the old patch does a special handling for skipping virtio
> header. However, that could be simply done by adjusting desc_avail
> and desc_offset var:
>
> desc_avail = desc->len - vq->vhost_hlen;
> desc_offset = vq->vhost_hlen;
>
> This refactor makes the code much more readable (IMO), yet it reduces
> binary code size (nearly 2K).
>
> Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
> ---
>
> v2: - fix potential NULL dereference bug of var "prev" and "head"
> ---
> lib/librte_vhost/vhost_rxtx.c | 297 +++++++++++++++++-------------------------
> 1 file changed, 116 insertions(+), 181 deletions(-)
>
> diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
> index 5e7e5b1..d5cd0fa 100644
> --- a/lib/librte_vhost/vhost_rxtx.c
> +++ b/lib/librte_vhost/vhost_rxtx.c
> @@ -702,21 +702,104 @@ vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
> }
> }
>
> +static inline struct rte_mbuf *
> +copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
> + uint16_t desc_idx, struct rte_mempool *mbuf_pool)
> +{
> + struct vring_desc *desc;
> + uint64_t desc_addr;
> + uint32_t desc_avail, desc_offset;
> + uint32_t mbuf_avail, mbuf_offset;
> + uint32_t cpy_len;
> + struct rte_mbuf *head = NULL;
> + struct rte_mbuf *cur = NULL, *prev = NULL;
> + struct virtio_net_hdr *hdr;
> +
> + desc = &vq->desc[desc_idx];
> + desc_addr = gpa_to_vva(dev, desc->addr);
> + rte_prefetch0((void *)(uintptr_t)desc_addr);
> +
> + /* Retrieve virtio net header */
> + hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
> + desc_avail = desc->len - vq->vhost_hlen;
There is a serious bug here: desc->len - vq->vhost_hlen could overflow.
A VM could easily create this case. Let us fix it here.
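For illustration only (the actual fix lands later in this series, in patch
6/7, and the exact check there may differ), a minimal guard before the
subtraction could look like this:

    /* hypothetical sanity check: refuse a first desc that cannot even
     * hold the virtio-net header, so the subtraction cannot wrap */
    if (unlikely(desc->len < vq->vhost_hlen)) {
        RTE_LOG(ERR, VHOST_DATA,
            "(%"PRIu64") invalid descriptor length %u\n",
            dev->device_fh, desc->len);
        return NULL;
    }

    desc_avail  = desc->len - vq->vhost_hlen;
    desc_offset = vq->vhost_hlen;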
> + desc_offset = vq->vhost_hlen;
> +
> + mbuf_avail = 0;
> + mbuf_offset = 0;
> + while (desc_avail || (desc->flags & VRING_DESC_F_NEXT) != 0) {
> + /* This desc reachs to its end, get the next one */
> + if (desc_avail == 0) {
> + desc = &vq->desc[desc->next];
> +
> + desc_addr = gpa_to_vva(dev, desc->addr);
> + rte_prefetch0((void *)(uintptr_t)desc_addr);
> +
> + desc_offset = 0;
> + desc_avail = desc->len;
> +
> + PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
> + }
> +
> + /*
> + * This mbuf reachs to its end, get a new one
> + * to hold more data.
> + */
> + if (mbuf_avail == 0) {
> + cur = rte_pktmbuf_alloc(mbuf_pool);
> + if (unlikely(!cur)) {
> + RTE_LOG(ERR, VHOST_DATA, "Failed to "
> + "allocate memory for mbuf.\n");
> + if (head)
> + rte_pktmbuf_free(head);
> + return NULL;
> + }
> + if (!head) {
> + head = cur;
> + } else {
> + prev->next = cur;
> + prev->data_len = mbuf_offset;
> + head->nb_segs += 1;
> + }
> + head->pkt_len += mbuf_offset;
> + prev = cur;
> +
> + mbuf_offset = 0;
> + mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
> + }
> +
> + cpy_len = RTE_MIN(desc_avail, mbuf_avail);
> + rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, mbuf_offset),
> + (void *)((uintptr_t)(desc_addr + desc_offset)),
> + cpy_len);
> +
> + mbuf_avail -= cpy_len;
> + mbuf_offset += cpy_len;
> + desc_avail -= cpy_len;
> + desc_offset += cpy_len;
> + }
> +
> + if (prev) {
> + prev->data_len = mbuf_offset;
> + head->pkt_len += mbuf_offset;
> +
> + if (hdr->flags != 0 || hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE)
> + vhost_dequeue_offload(hdr, head);
> + }
> +
> + return head;
> +}
> +
> uint16_t
> rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
> struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
> {
> - struct rte_mbuf *m, *prev;
> struct vhost_virtqueue *vq;
> - struct vring_desc *desc;
> - uint64_t vb_addr = 0;
> - uint64_t vb_net_hdr_addr = 0;
> - uint32_t head[MAX_PKT_BURST];
> + uint32_t desc_indexes[MAX_PKT_BURST];
> uint32_t used_idx;
> uint32_t i;
> - uint16_t free_entries, entry_success = 0;
> + uint16_t free_entries;
> uint16_t avail_idx;
> - struct virtio_net_hdr *hdr = NULL;
> + struct rte_mbuf *m;
>
> if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) {
> RTE_LOG(ERR, VHOST_DATA,
> @@ -730,197 +813,49 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
> return 0;
>
> avail_idx = *((volatile uint16_t *)&vq->avail->idx);
> -
> - /* If there are no available buffers then return. */
> - if (vq->last_used_idx == avail_idx)
> + free_entries = avail_idx - vq->last_used_idx;
> + if (free_entries == 0)
> return 0;
>
> - LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__,
> - dev->device_fh);
> + LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__, dev->device_fh);
>
> - /* Prefetch available ring to retrieve head indexes. */
> - rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);
> + used_idx = vq->last_used_idx & (vq->size -1);
>
> - /*get the number of free entries in the ring*/
> - free_entries = (avail_idx - vq->last_used_idx);
> + /* Prefetch available ring to retrieve head indexes. */
> + rte_prefetch0(&vq->avail->ring[used_idx]);
>
> - free_entries = RTE_MIN(free_entries, count);
> - /* Limit to MAX_PKT_BURST. */
> - free_entries = RTE_MIN(free_entries, MAX_PKT_BURST);
> + count = RTE_MIN(count, MAX_PKT_BURST);
> + count = RTE_MIN(count, free_entries);
> + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") about to dequeue %u buffers\n",
> + dev->device_fh, count);
>
> - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
> - dev->device_fh, free_entries);
> /* Retrieve all of the head indexes first to avoid caching issues. */
> - for (i = 0; i < free_entries; i++)
> - head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];
> + for (i = 0; i < count; i++) {
> + desc_indexes[i] = vq->avail->ring[(vq->last_used_idx + i) &
> + (vq->size - 1)];
> + }
>
> /* Prefetch descriptor index. */
> - rte_prefetch0(&vq->desc[head[entry_success]]);
> + rte_prefetch0(&vq->desc[desc_indexes[0]]);
> rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
>
> - while (entry_success < free_entries) {
> - uint32_t vb_avail, vb_offset;
> - uint32_t seg_avail, seg_offset;
> - uint32_t cpy_len;
> - uint32_t seg_num = 0;
> - struct rte_mbuf *cur;
> - uint8_t alloc_err = 0;
> -
> - desc = &vq->desc[head[entry_success]];
> -
> - vb_net_hdr_addr = gpa_to_vva(dev, desc->addr);
> - hdr = (struct virtio_net_hdr *)((uintptr_t)vb_net_hdr_addr);
> -
> - /* Discard first buffer as it is the virtio header */
> - if (desc->flags & VRING_DESC_F_NEXT) {
> - desc = &vq->desc[desc->next];
> - vb_offset = 0;
> - vb_avail = desc->len;
> - } else {
> - vb_offset = vq->vhost_hlen;
> - vb_avail = desc->len - vb_offset;
> - }
> -
> - /* Buffer address translation. */
> - vb_addr = gpa_to_vva(dev, desc->addr);
> - /* Prefetch buffer address. */
> - rte_prefetch0((void *)(uintptr_t)vb_addr);
> -
> - used_idx = vq->last_used_idx & (vq->size - 1);
> -
> - if (entry_success < (free_entries - 1)) {
> - /* Prefetch descriptor index. */
> - rte_prefetch0(&vq->desc[head[entry_success+1]]);
> - rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
> - }
> -
> - /* Update used index buffer information. */
> - vq->used->ring[used_idx].id = head[entry_success];
> - vq->used->ring[used_idx].len = 0;
> -
> - /* Allocate an mbuf and populate the structure. */
> - m = rte_pktmbuf_alloc(mbuf_pool);
> - if (unlikely(m == NULL)) {
> - RTE_LOG(ERR, VHOST_DATA,
> - "Failed to allocate memory for mbuf.\n");
> - break;
> - }
> - seg_offset = 0;
> - seg_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
> - cpy_len = RTE_MIN(vb_avail, seg_avail);
> -
> - PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0);
> -
> - seg_num++;
> - cur = m;
> - prev = m;
> - while (cpy_len != 0) {
> - rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, seg_offset),
> - (void *)((uintptr_t)(vb_addr + vb_offset)),
> - cpy_len);
> -
> - seg_offset += cpy_len;
> - vb_offset += cpy_len;
> - vb_avail -= cpy_len;
> - seg_avail -= cpy_len;
> -
> - if (vb_avail != 0) {
> - /*
> - * The segment reachs to its end,
> - * while the virtio buffer in TX vring has
> - * more data to be copied.
> - */
> - cur->data_len = seg_offset;
> - m->pkt_len += seg_offset;
> - /* Allocate mbuf and populate the structure. */
> - cur = rte_pktmbuf_alloc(mbuf_pool);
> - if (unlikely(cur == NULL)) {
> - RTE_LOG(ERR, VHOST_DATA, "Failed to "
> - "allocate memory for mbuf.\n");
> - rte_pktmbuf_free(m);
> - alloc_err = 1;
> - break;
> - }
> -
> - seg_num++;
> - prev->next = cur;
> - prev = cur;
> - seg_offset = 0;
> - seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
> - } else {
> - if (desc->flags & VRING_DESC_F_NEXT) {
> - /*
> - * There are more virtio buffers in
> - * same vring entry need to be copied.
> - */
> - if (seg_avail == 0) {
> - /*
> - * The current segment hasn't
> - * room to accomodate more
> - * data.
> - */
> - cur->data_len = seg_offset;
> - m->pkt_len += seg_offset;
> - /*
> - * Allocate an mbuf and
> - * populate the structure.
> - */
> - cur = rte_pktmbuf_alloc(mbuf_pool);
> - if (unlikely(cur == NULL)) {
> - RTE_LOG(ERR,
> - VHOST_DATA,
> - "Failed to "
> - "allocate memory "
> - "for mbuf\n");
> - rte_pktmbuf_free(m);
> - alloc_err = 1;
> - break;
> - }
> - seg_num++;
> - prev->next = cur;
> - prev = cur;
> - seg_offset = 0;
> - seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
> - }
> -
> - desc = &vq->desc[desc->next];
> -
> - /* Buffer address translation. */
> - vb_addr = gpa_to_vva(dev, desc->addr);
> - /* Prefetch buffer address. */
> - rte_prefetch0((void *)(uintptr_t)vb_addr);
> - vb_offset = 0;
> - vb_avail = desc->len;
> -
> - PRINT_PACKET(dev, (uintptr_t)vb_addr,
> - desc->len, 0);
> - } else {
> - /* The whole packet completes. */
> - cur->data_len = seg_offset;
> - m->pkt_len += seg_offset;
> - vb_avail = 0;
> - }
> - }
> -
> - cpy_len = RTE_MIN(vb_avail, seg_avail);
> - }
> -
> - if (unlikely(alloc_err == 1))
> + for (i = 0; i < count; i++) {
> + m = copy_desc_to_mbuf(dev, vq, desc_indexes[i], mbuf_pool);
> + if (m == NULL)
> break;
> + pkts[i] = m;
>
> - m->nb_segs = seg_num;
> - if ((hdr->flags != 0) || (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE))
> - vhost_dequeue_offload(hdr, m);
> -
> - pkts[entry_success] = m;
> - vq->last_used_idx++;
> - entry_success++;
> + used_idx = vq->last_used_idx++ & (vq->size - 1);
> + vq->used->ring[used_idx].id = desc_indexes[i];
> + vq->used->ring[used_idx].len = 0;
> }
>
> rte_compiler_barrier();
> - vq->used->idx += entry_success;
> + vq->used->idx += i;
> +
> /* Kick guest if required. */
> if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
> eventfd_write(vq->callfd, (eventfd_t)1);
> - return entry_success;
> +
> + return i;
> }
On 2/18/2016 9:48 PM, Yuanhan Liu wrote:
> + mbuf_avail = 0;
> + mbuf_offset = 0;
> + while (desc_avail || (desc->flags & VRING_DESC_F_NEXT) != 0) {
> + /* This desc reachs to its end, get the next one */
> + if (desc_avail == 0) {
> + desc = &vq->desc[desc->next];
> +
> + desc_addr = gpa_to_vva(dev, desc->addr);
> + rte_prefetch0((void *)(uintptr_t)desc_addr);
> +
> + desc_offset = 0;
> + desc_avail = desc->len;
> +
> + PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
> + }
> +
> + /*
> + * This mbuf reachs to its end, get a new one
> + * to hold more data.
> + */
> + if (mbuf_avail == 0) {
> + cur = rte_pktmbuf_alloc(mbuf_pool);
> + if (unlikely(!cur)) {
> + RTE_LOG(ERR, VHOST_DATA, "Failed to "
> + "allocate memory for mbuf.\n");
> + if (head)
> + rte_pktmbuf_free(head);
> + return NULL;
> + }
We could always allocate the head mbuf before the loop; that saves the
following branch and makes the code more streamlined.
It also reminds me that this change prevents the possibility of mbuf bulk
allocation; one solution is to pass the head mbuf in through an additional
parameter, roughly as sketched below.
Btw, put unlikely() before the check of mbuf_avail and the checks elsewhere.
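For illustration, a rough sketch of that direction (the signature change is
hypothetical, and it assumes an mbuf bulk-allocation helper such as
rte_pktmbuf_alloc_bulk() is available, which is not part of this patch):

    /* hypothetical variant: the head mbuf is supplied by the caller,
     * so the caller can bulk-allocate heads for the whole burst */
    static inline int
    copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
              uint16_t desc_idx, struct rte_mbuf *head,
              struct rte_mempool *mbuf_pool);

    /* caller side sketch */
    struct rte_mbuf *heads[MAX_PKT_BURST];

    if (rte_pktmbuf_alloc_bulk(mbuf_pool, heads, count) != 0)
        return 0;

    for (i = 0; i < count; i++) {
        if (copy_desc_to_mbuf(dev, vq, desc_indexes[i],
                      heads[i], mbuf_pool) < 0)
            break;  /* heads[i..count-1] would still need freeing */
        pkts[i] = heads[i];
    }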
> + if (!head) {
> + head = cur;
> + } else {
> + prev->next = cur;
> + prev->data_len = mbuf_offset;
> + head->nb_segs += 1;
> + }
> + head->pkt_len += mbuf_offset;
> + prev = cur;
> +
> + mbuf_offset = 0;
> + mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
> + }
> +
> + cpy_len = RTE_MIN(desc_avail, mbuf_avail);
> + rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, mbuf_offset),
> + (void *)((uintptr_t)(desc_addr + desc_offset)),
> + cpy_len);
> +
> + mbuf_avail -= cpy_len;
> + mbuf_offset += cpy_len;
> + desc_avail -= cpy_len;
> + desc_offset += cpy_len;
> + }
> +
On 2/18/2016 9:48 PM, Yuanhan Liu wrote:
> [...]
CCed Changchun, the author of the chained handling of desc and mbuf.
The change makes the code more readable, but I think the following
commit message is simple and sufficient.
>
> while (this_desc_is_not_drained_totally || has_next_desc) {
> if (this_desc_has_drained_totally) {
> this_desc = next_desc();
> }
>
> if (mbuf_has_no_room) {
> mbuf = allocate_a_new_mbuf();
> }
>
> COPY(mbuf, desc);
> }
>
> [...]
>
> This refactor makes the code much more readable (IMO), yet it reduces
> binary code size (nearly 2K).
I guess the reduced binary code size comes from the reduced number of
inlined calls to mbuf allocation.
On 2/18/2016 9:48 PM, Yuanhan Liu wrote:
> The current rte_vhost_dequeue_burst() implementation is a bit messy
[...]
> +
> uint16_t
> rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
> struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
> {
> - struct rte_mbuf *m, *prev;
> struct vhost_virtqueue *vq;
> - struct vring_desc *desc;
> - uint64_t vb_addr = 0;
> - uint64_t vb_net_hdr_addr = 0;
> - uint32_t head[MAX_PKT_BURST];
> + uint32_t desc_indexes[MAX_PKT_BURST];
indices
> uint32_t used_idx;
> uint32_t i;
> - uint16_t free_entries, entry_success = 0;
> + uint16_t free_entries;
> uint16_t avail_idx;
> - struct virtio_net_hdr *hdr = NULL;
> + struct rte_mbuf *m;
>
> if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) {
> RTE_LOG(ERR, VHOST_DATA,
> @@ -730,197 +813,49 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
> return 0;
>
> avail_idx = *((volatile uint16_t *)&vq->avail->idx);
> -
> - /* If there are no available buffers then return. */
> - if (vq->last_used_idx == avail_idx)
> + free_entries = avail_idx - vq->last_used_idx;
> + if (free_entries == 0)
> return 0;
>
> - LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__,
> - dev->device_fh);
> + LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__, dev->device_fh);
>
> - /* Prefetch available ring to retrieve head indexes. */
> - rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);
> + used_idx = vq->last_used_idx & (vq->size -1);
>
> - /*get the number of free entries in the ring*/
> - free_entries = (avail_idx - vq->last_used_idx);
> + /* Prefetch available ring to retrieve head indexes. */
> + rte_prefetch0(&vq->avail->ring[used_idx]);
>
> - free_entries = RTE_MIN(free_entries, count);
> - /* Limit to MAX_PKT_BURST. */
> - free_entries = RTE_MIN(free_entries, MAX_PKT_BURST);
> + count = RTE_MIN(count, MAX_PKT_BURST);
> + count = RTE_MIN(count, free_entries);
> + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") about to dequeue %u buffers\n",
> + dev->device_fh, count);
>
> - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
> - dev->device_fh, free_entries);
> /* Retrieve all of the head indexes first to avoid caching issues. */
> - for (i = 0; i < free_entries; i++)
> - head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];
> + for (i = 0; i < count; i++) {
> + desc_indexes[i] = vq->avail->ring[(vq->last_used_idx + i) &
> + (vq->size - 1)];
> + }
>
> /* Prefetch descriptor index. */
> - rte_prefetch0(&vq->desc[head[entry_success]]);
> + rte_prefetch0(&vq->desc[desc_indexes[0]]);
> rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
>
> - while (entry_success < free_entries) {
> - uint32_t vb_avail, vb_offset;
> - uint32_t seg_avail, seg_offset;
> - uint32_t cpy_len;
> - uint32_t seg_num = 0;
> - struct rte_mbuf *cur;
> - uint8_t alloc_err = 0;
> -
> - desc = &vq->desc[head[entry_success]];
> -
> - vb_net_hdr_addr = gpa_to_vva(dev, desc->addr);
> - hdr = (struct virtio_net_hdr *)((uintptr_t)vb_net_hdr_addr);
> -
> - /* Discard first buffer as it is the virtio header */
> - if (desc->flags & VRING_DESC_F_NEXT) {
> - desc = &vq->desc[desc->next];
> - vb_offset = 0;
> - vb_avail = desc->len;
> - } else {
> - vb_offset = vq->vhost_hlen;
> - vb_avail = desc->len - vb_offset;
> - }
> -
> - /* Buffer address translation. */
> - vb_addr = gpa_to_vva(dev, desc->addr);
> - /* Prefetch buffer address. */
> - rte_prefetch0((void *)(uintptr_t)vb_addr);
> -
> - used_idx = vq->last_used_idx & (vq->size - 1);
> -
> - if (entry_success < (free_entries - 1)) {
> - /* Prefetch descriptor index. */
> - rte_prefetch0(&vq->desc[head[entry_success+1]]);
> - rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
> - }
Why is this prefetch silently dropped in the patch?
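For what it's worth, a sketch of how the same look-ahead prefetch could be
kept in the refactored dequeue loop:

    for (i = 0; i < count; i++) {
        /* prefetch the next head desc and used ring entry while the
         * current packet is being copied */
        if (i + 1 < count) {
            rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
            rte_prefetch0(&vq->used->ring[(vq->last_used_idx + 1) &
                              (vq->size - 1)]);
        }

        m = copy_desc_to_mbuf(dev, vq, desc_indexes[i], mbuf_pool);
        if (m == NULL)
            break;
        pkts[i] = m;

        used_idx = vq->last_used_idx++ & (vq->size - 1);
        vq->used->ring[used_idx].id  = desc_indexes[i];
        vq->used->ring[used_idx].len = 0;
    }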
> -
> - /* Update used index buffer information. */
> - vq->used->ring[used_idx].id = head[entry_success];
> - vq->used->ring[used_idx].len = 0;
> -
> - /* Allocate an mbuf and populate the structure. */
> - m = rte_pktmbuf_alloc(mbuf_pool);
> - if (unlikely(m == NULL)) {
> - RTE_LOG(ERR, VHOST_DATA,
> - "Failed to allocate memory for mbuf.\n");
> - break;
> - }
> - seg_offset = 0;
> - seg_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
> - cpy_len = RTE_MIN(vb_avail, seg_avail);
> -
> - PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0);
> -
> - seg_num++;
> - cur = m;
> - prev = m;
> - while (cpy_len != 0) {
> - rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, seg_offset),
> - (void *)((uintptr_t)(vb_addr + vb_offset)),
> - cpy_len);
> -
> - seg_offset += cpy_len;
> - vb_offset += cpy_len;
> - vb_avail -= cpy_len;
> - seg_avail -= cpy_len;
> -
> - if (vb_avail != 0) {
> - /*
> - * The segment reachs to its end,
> - * while the virtio buffer in TX vring has
> - * more data to be copied.
> - */
> - cur->data_len = seg_offset;
> - m->pkt_len += seg_offset;
> - /* Allocate mbuf and populate the structure. */
> - cur = rte_pktmbuf_alloc(mbuf_pool);
> - if (unlikely(cur == NULL)) {
> - RTE_LOG(ERR, VHOST_DATA, "Failed to "
> - "allocate memory for mbuf.\n");
> - rte_pktmbuf_free(m);
> - alloc_err = 1;
> - break;
> - }
> -
> - seg_num++;
> - prev->next = cur;
> - prev = cur;
> - seg_offset = 0;
> - seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
> - } else {
> - if (desc->flags & VRING_DESC_F_NEXT) {
> - /*
> - * There are more virtio buffers in
> - * same vring entry need to be copied.
> - */
> - if (seg_avail == 0) {
> - /*
> - * The current segment hasn't
> - * room to accomodate more
> - * data.
> - */
> - cur->data_len = seg_offset;
> - m->pkt_len += seg_offset;
> - /*
> - * Allocate an mbuf and
> - * populate the structure.
> - */
> - cur = rte_pktmbuf_alloc(mbuf_pool);
> - if (unlikely(cur == NULL)) {
> - RTE_LOG(ERR,
> - VHOST_DATA,
> - "Failed to "
> - "allocate memory "
> - "for mbuf\n");
> - rte_pktmbuf_free(m);
> - alloc_err = 1;
> - break;
> - }
> - seg_num++;
> - prev->next = cur;
> - prev = cur;
> - seg_offset = 0;
> - seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
> - }
> -
> - desc = &vq->desc[desc->next];
> -
> - /* Buffer address translation. */
> - vb_addr = gpa_to_vva(dev, desc->addr);
> - /* Prefetch buffer address. */
> - rte_prefetch0((void *)(uintptr_t)vb_addr);
> - vb_offset = 0;
> - vb_avail = desc->len;
> -
> - PRINT_PACKET(dev, (uintptr_t)vb_addr,
> - desc->len, 0);
> - } else {
> - /* The whole packet completes. */
> - cur->data_len = seg_offset;
> - m->pkt_len += seg_offset;
> - vb_avail = 0;
> - }
> - }
> -
> - cpy_len = RTE_MIN(vb_avail, seg_avail);
> - }
> -
> - if (unlikely(alloc_err == 1))
> + for (i = 0; i < count; i++) {
> + m = copy_desc_to_mbuf(dev, vq, desc_indexes[i], mbuf_pool);
> + if (m == NULL)
add unlikely() to every check for a case that is not expected to happen
> break;
> + pkts[i] = m;
>
> - m->nb_segs = seg_num;
> - if ((hdr->flags != 0) || (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE))
> - vhost_dequeue_offload(hdr, m);
> -
> - pkts[entry_success] = m;
> - vq->last_used_idx++;
> - entry_success++;
> + used_idx = vq->last_used_idx++ & (vq->size - 1);
> + vq->used->ring[used_idx].id = desc_indexes[i];
> + vq->used->ring[used_idx].len = 0;
What is the correct value for ring[used_idx].len, the packet length or 0?
> }
>
> rte_compiler_barrier();
> - vq->used->idx += entry_success;
> + vq->used->idx += i;
> +
> /* Kick guest if required. */
> if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
> eventfd_write(vq->callfd, (eventfd_t)1);
> - return entry_success;
> +
> + return i;
> }
On Thu, Mar 03, 2016 at 05:19:42PM +0000, Xie, Huawei wrote:
> On 2/18/2016 9:48 PM, Yuanhan Liu wrote:
> > [...]
> CCed changchun, the author for the chained handling of desc and mbuf.
> The change makes the code more readable, but i think the following
> commit message is simple and enough.
Hmm.., my commit log tells a full story:
- What is the issue? (messy/twisted-logic code)
- What does the code do? (And what are the challenges: a few tricky places)
- What's the proposed solution to fix it? (the pseudo code below)
And you suggest that I get rid of the first 2 items and leave only the 3rd
item (the solution)?
--yliu
> >
> > while (this_desc_is_not_drained_totally || has_next_desc) {
> > if (this_desc_has_drained_totally) {
> > this_desc = next_desc();
> > }
> >
> > if (mbuf_has_no_room) {
> > mbuf = allocate_a_new_mbuf();
> > }
> >
> > COPY(mbuf, desc);
> > }
> >
> > [...]
> >
> > This refactor makes the code much more readable (IMO), yet it reduces
> > binary code size (nearly 2K).
> I guess the reduced binary code size comes from reduced inline calls to
> mbuf allocation.
>
On Thu, Mar 03, 2016 at 04:30:42PM +0000, Xie, Huawei wrote:
> On 2/18/2016 9:48 PM, Yuanhan Liu wrote:
> > + mbuf_avail = 0;
> > + mbuf_offset = 0;
> > + while (desc_avail || (desc->flags & VRING_DESC_F_NEXT) != 0) {
> > + /* This desc reachs to its end, get the next one */
> > + if (desc_avail == 0) {
> > + desc = &vq->desc[desc->next];
> > +
> > + desc_addr = gpa_to_vva(dev, desc->addr);
> > + rte_prefetch0((void *)(uintptr_t)desc_addr);
> > +
> > + desc_offset = 0;
> > + desc_avail = desc->len;
> > +
> > + PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
> > + }
> > +
> > + /*
> > + * This mbuf reachs to its end, get a new one
> > + * to hold more data.
> > + */
> > + if (mbuf_avail == 0) {
> > + cur = rte_pktmbuf_alloc(mbuf_pool);
> > + if (unlikely(!cur)) {
> > + RTE_LOG(ERR, VHOST_DATA, "Failed to "
> > + "allocate memory for mbuf.\n");
> > + if (head)
> > + rte_pktmbuf_free(head);
> > + return NULL;
> > + }
>
> We could always allocate the head mbuf before the loop, then we save the
> following branch and make the code more streamlined.
> It reminds me that this change prevents the possibility of mbuf bulk
> allocation, one solution is we pass the head mbuf from an additional
> parameter.
Yep, that's also something I have thought of.
> Btw, put unlikely before the check of mbuf_avail and checks elsewhere.
I don't think so. It would benefit small packets. What about the case,
however, when TSO or jumbo frames are enabled and we have big packets?
--yliu
On Thu, Mar 03, 2016 at 04:21:19PM +0000, Xie, Huawei wrote:
> On 2/18/2016 9:48 PM, Yuanhan Liu wrote:
> > The current rte_vhost_dequeue_burst() implementation is a bit messy
> > and logic twisted. And you could see repeat code here and there: it
> > invokes rte_pktmbuf_alloc() three times at three different places!
> >
> > However, rte_vhost_dequeue_burst() acutally does a simple job: copy
> > the packet data from vring desc to mbuf. What's tricky here is:
> >
> > - desc buff could be chained (by desc->next field), so that you need
> > fetch next one if current is wholly drained.
> >
> > - One mbuf could not be big enough to hold all desc buff, hence you
> > need to chain the mbuf as well, by the mbuf->next field.
> >
> > Even though, the logic could be simple. Here is the pseudo code.
> >
> > while (this_desc_is_not_drained_totally || has_next_desc) {
> > if (this_desc_has_drained_totally) {
> > this_desc = next_desc();
> > }
> >
> > if (mbuf_has_no_room) {
> > mbuf = allocate_a_new_mbuf();
> > }
> >
> > COPY(mbuf, desc);
> > }
> >
> > And this is how I refactored rte_vhost_dequeue_burst.
> >
> > Note that the old patch does a special handling for skipping virtio
> > header. However, that could be simply done by adjusting desc_avail
> > and desc_offset var:
> >
> > desc_avail = desc->len - vq->vhost_hlen;
> > desc_offset = vq->vhost_hlen;
> >
> > This refactor makes the code much more readable (IMO), yet it reduces
> > binary code size (nearly 2K).
> >
> > Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
> > ---
> >
> > v2: - fix potential NULL dereference bug of var "prev" and "head"
> > ---
> > lib/librte_vhost/vhost_rxtx.c | 297 +++++++++++++++++-------------------------
> > 1 file changed, 116 insertions(+), 181 deletions(-)
> >
> > diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
> > index 5e7e5b1..d5cd0fa 100644
> > --- a/lib/librte_vhost/vhost_rxtx.c
> > +++ b/lib/librte_vhost/vhost_rxtx.c
> > @@ -702,21 +702,104 @@ vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
> > }
> > }
> >
> > +static inline struct rte_mbuf *
> > +copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > + uint16_t desc_idx, struct rte_mempool *mbuf_pool)
> > +{
> > + struct vring_desc *desc;
> > + uint64_t desc_addr;
> > + uint32_t desc_avail, desc_offset;
> > + uint32_t mbuf_avail, mbuf_offset;
> > + uint32_t cpy_len;
> > + struct rte_mbuf *head = NULL;
> > + struct rte_mbuf *cur = NULL, *prev = NULL;
> > + struct virtio_net_hdr *hdr;
> > +
> > + desc = &vq->desc[desc_idx];
> > + desc_addr = gpa_to_vva(dev, desc->addr);
> > + rte_prefetch0((void *)(uintptr_t)desc_addr);
> > +
> > + /* Retrieve virtio net header */
> > + hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
> > + desc_avail = desc->len - vq->vhost_hlen;
>
> There is a serious bug here, desc->len - vq->vhost_len could overflow.
> VM could easily create this case. Let us fix it here.
Nope, this issue has been there since the beginning, and this patch
is a refactor: we should not bring any functional changes. Therefore,
we should not fix it here.
And actually, it's been fixed in the 6th patch in this series:
[PATCH v2 6/7] vhost: do sanity check for desc->len
--yliu
On Thu, Mar 03, 2016 at 05:40:14PM +0000, Xie, Huawei wrote:
> On 2/18/2016 9:48 PM, Yuanhan Liu wrote:
> > The current rte_vhost_dequeue_burst() implementation is a bit messy
> [...]
> > +
> > uint16_t
> > rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
> > struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
> > {
> > - struct rte_mbuf *m, *prev;
> > struct vhost_virtqueue *vq;
> > - struct vring_desc *desc;
> > - uint64_t vb_addr = 0;
> > - uint64_t vb_net_hdr_addr = 0;
> > - uint32_t head[MAX_PKT_BURST];
> > + uint32_t desc_indexes[MAX_PKT_BURST];
>
> indices
http://dictionary.reference.com/browse/index
index
noun, plural indexes, indices
>
>
> > uint32_t used_idx;
> > uint32_t i;
> > - uint16_t free_entries, entry_success = 0;
> > + uint16_t free_entries;
> > uint16_t avail_idx;
> > - struct virtio_net_hdr *hdr = NULL;
> > + struct rte_mbuf *m;
> >
> > if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) {
> > RTE_LOG(ERR, VHOST_DATA,
> > @@ -730,197 +813,49 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
> > return 0;
> >
> > - if (entry_success < (free_entries - 1)) {
> > - /* Prefetch descriptor index. */
> > - rte_prefetch0(&vq->desc[head[entry_success+1]]);
> > - rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
> > - }
>
> Why is this prefetch silently dropped in the patch?
Oops, good catch. Will fix it. Thanks.
> > break;
> > + pkts[i] = m;
> >
> > - m->nb_segs = seg_num;
> > - if ((hdr->flags != 0) || (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE))
> > - vhost_dequeue_offload(hdr, m);
> > -
> > - pkts[entry_success] = m;
> > - vq->last_used_idx++;
> > - entry_success++;
> > + used_idx = vq->last_used_idx++ & (vq->size - 1);
> > + vq->used->ring[used_idx].id = desc_indexes[i];
> > + vq->used->ring[used_idx].len = 0;
>
> What is the correct value for ring[used_idx].len, the packet length or 0?
Good question. I didn't notice that before. Sounds buggy to me. However,
that's from the old code. Will check it.
--yliu
On 3/4/2016 10:19 AM, Yuanhan Liu wrote:
> On Thu, Mar 03, 2016 at 04:21:19PM +0000, Xie, Huawei wrote:
>> On 2/18/2016 9:48 PM, Yuanhan Liu wrote:
>>> The current rte_vhost_dequeue_burst() implementation is a bit messy
>>> and logic twisted. And you could see repeat code here and there: it
>>> invokes rte_pktmbuf_alloc() three times at three different places!
>>>
>>> However, rte_vhost_dequeue_burst() acutally does a simple job: copy
>>> the packet data from vring desc to mbuf. What's tricky here is:
>>>
>>> - desc buff could be chained (by desc->next field), so that you need
>>> fetch next one if current is wholly drained.
>>>
>>> - One mbuf could not be big enough to hold all desc buff, hence you
>>> need to chain the mbuf as well, by the mbuf->next field.
>>>
>>> Even though, the logic could be simple. Here is the pseudo code.
>>>
>>> while (this_desc_is_not_drained_totally || has_next_desc) {
>>> if (this_desc_has_drained_totally) {
>>> this_desc = next_desc();
>>> }
>>>
>>> if (mbuf_has_no_room) {
>>> mbuf = allocate_a_new_mbuf();
>>> }
>>>
>>> COPY(mbuf, desc);
>>> }
>>>
>>> And this is how I refactored rte_vhost_dequeue_burst.
>>>
>>> Note that the old patch does a special handling for skipping virtio
>>> header. However, that could be simply done by adjusting desc_avail
>>> and desc_offset var:
>>>
>>> desc_avail = desc->len - vq->vhost_hlen;
>>> desc_offset = vq->vhost_hlen;
>>>
>>> This refactor makes the code much more readable (IMO), yet it reduces
>>> binary code size (nearly 2K).
>>>
>>> Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
>>> ---
>>>
>>> v2: - fix potential NULL dereference bug of var "prev" and "head"
>>> ---
>>> lib/librte_vhost/vhost_rxtx.c | 297 +++++++++++++++++-------------------------
>>> 1 file changed, 116 insertions(+), 181 deletions(-)
>>>
>>> diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
>>> index 5e7e5b1..d5cd0fa 100644
>>> --- a/lib/librte_vhost/vhost_rxtx.c
>>> +++ b/lib/librte_vhost/vhost_rxtx.c
>>> @@ -702,21 +702,104 @@ vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
>>> }
>>> }
>>>
>>> +static inline struct rte_mbuf *
>>> +copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>> + uint16_t desc_idx, struct rte_mempool *mbuf_pool)
>>> +{
>>> + struct vring_desc *desc;
>>> + uint64_t desc_addr;
>>> + uint32_t desc_avail, desc_offset;
>>> + uint32_t mbuf_avail, mbuf_offset;
>>> + uint32_t cpy_len;
>>> + struct rte_mbuf *head = NULL;
>>> + struct rte_mbuf *cur = NULL, *prev = NULL;
>>> + struct virtio_net_hdr *hdr;
>>> +
>>> + desc = &vq->desc[desc_idx];
>>> + desc_addr = gpa_to_vva(dev, desc->addr);
>>> + rte_prefetch0((void *)(uintptr_t)desc_addr);
>>> +
>>> + /* Retrieve virtio net header */
>>> + hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
>>> + desc_avail = desc->len - vq->vhost_hlen;
>> There is a serious bug here, desc->len - vq->vhost_len could overflow.
>> VM could easily create this case. Let us fix it here.
> Nope, this issue has been there since the beginning, and this patch
> is a refactor: we should not bring any functional changes. Therefore,
> we should not fix it here.
No, I don't mean fixing it exactly in this patch, but in this series.
Besides, from a refactoring perspective, we could actually make things
much simpler and more readable. Both the desc chains and the mbufs could
be converted into iovecs; then both dequeue (copy_desc_to_mbuf) and
enqueue (copy_mbuf_to_desc) could use the commonly used iovec copying
algorithms, roughly as sketched below. As the data path is performance
oriented, let us stop here.
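As a purely illustrative sketch of that iovec idea (the struct and helper
below are hypothetical, not existing vhost code), both chains could be
flattened into segment arrays and copied with one generic loop:

    /* hypothetical segment descriptor for either side of the copy */
    struct seg_vec {
        uint64_t addr;  /* host virtual address of the segment */
        uint32_t len;   /* bytes available in the segment */
    };

    /* generic copy between two pre-built segment arrays */
    static void
    copy_seg_vec(struct seg_vec *dst, uint32_t dst_cnt,
             struct seg_vec *src, uint32_t src_cnt)
    {
        uint32_t d = 0, s = 0, d_off = 0, s_off = 0;
        uint32_t len;

        while (d < dst_cnt && s < src_cnt) {
            len = RTE_MIN(dst[d].len - d_off, src[s].len - s_off);
            rte_memcpy((void *)(uintptr_t)(dst[d].addr + d_off),
                   (void *)(uintptr_t)(src[s].addr + s_off), len);

            d_off += len;
            s_off += len;
            if (d_off == dst[d].len) { d++; d_off = 0; }
            if (s_off == src[s].len) { s++; s_off = 0; }
        }
    }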
>
> And actually, it's been fixed in the 6th patch in this series:
Will check that.
>
> [PATCH v2 6/7] vhost: do sanity check for desc->len
>
> --yliu
>
On 3/4/2016 10:15 AM, Yuanhan Liu wrote:
> On Thu, Mar 03, 2016 at 04:30:42PM +0000, Xie, Huawei wrote:
>> On 2/18/2016 9:48 PM, Yuanhan Liu wrote:
>>> + mbuf_avail = 0;
>>> + mbuf_offset = 0;
>>> + while (desc_avail || (desc->flags & VRING_DESC_F_NEXT) != 0) {
>>> + /* This desc reachs to its end, get the next one */
>>> + if (desc_avail == 0) {
>>> + desc = &vq->desc[desc->next];
>>> +
>>> + desc_addr = gpa_to_vva(dev, desc->addr);
>>> + rte_prefetch0((void *)(uintptr_t)desc_addr);
>>> +
>>> + desc_offset = 0;
>>> + desc_avail = desc->len;
>>> +
>>> + PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
>>> + }
>>> +
>>> + /*
>>> + * This mbuf reachs to its end, get a new one
>>> + * to hold more data.
>>> + */
>>> + if (mbuf_avail == 0) {
>>> + cur = rte_pktmbuf_alloc(mbuf_pool);
>>> + if (unlikely(!cur)) {
>>> + RTE_LOG(ERR, VHOST_DATA, "Failed to "
>>> + "allocate memory for mbuf.\n");
>>> + if (head)
>>> + rte_pktmbuf_free(head);
>>> + return NULL;
>>> + }
>> We could always allocate the head mbuf before the loop, then we save the
>> following branch and make the code more streamlined.
>> It reminds me that this change prevents the possibility of mbuf bulk
>> allocation, one solution is we pass the head mbuf from an additional
>> parameter.
> Yep, that's also something I have thought of.
>
>> Btw, put unlikely before the check of mbuf_avail and checks elsewhere.
> I don't think so. It would benifit for the small packets. What if,
> however, when TSO or jumbo frame is enabled that we have big packets?
Prefer to favor the path where the packet fits in one mbuf.
Btw, not specific to this patch: for "m = copy_desc_to_mbuf(dev, vq,
desc_indexes[i], mbuf_pool)", the failure case is unlikely to happen, so
add unlikely() to the "if (m == NULL)" check there, as shown below. Please
check all the branches elsewhere.
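That is, something along these lines:

    m = copy_desc_to_mbuf(dev, vq, desc_indexes[i], mbuf_pool);
    if (unlikely(m == NULL))
        break;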
> --yliu
>
On Mon, Mar 07, 2016 at 02:19:54AM +0000, Xie, Huawei wrote:
> On 3/4/2016 10:19 AM, Yuanhan Liu wrote:
> > On Thu, Mar 03, 2016 at 04:21:19PM +0000, Xie, Huawei wrote:
> >> On 2/18/2016 9:48 PM, Yuanhan Liu wrote:
> >>> The current rte_vhost_dequeue_burst() implementation is a bit messy
> >>> and logic twisted. And you could see repeat code here and there: it
> >>> invokes rte_pktmbuf_alloc() three times at three different places!
> >>>
> >>> However, rte_vhost_dequeue_burst() acutally does a simple job: copy
> >>> the packet data from vring desc to mbuf. What's tricky here is:
> >>>
> >>> - desc buff could be chained (by desc->next field), so that you need
> >>> fetch next one if current is wholly drained.
> >>>
> >>> - One mbuf could not be big enough to hold all desc buff, hence you
> >>> need to chain the mbuf as well, by the mbuf->next field.
> >>>
> >>> Even though, the logic could be simple. Here is the pseudo code.
> >>>
> >>> while (this_desc_is_not_drained_totally || has_next_desc) {
> >>> if (this_desc_has_drained_totally) {
> >>> this_desc = next_desc();
> >>> }
> >>>
> >>> if (mbuf_has_no_room) {
> >>> mbuf = allocate_a_new_mbuf();
> >>> }
> >>>
> >>> COPY(mbuf, desc);
> >>> }
> >>>
> >>> And this is how I refactored rte_vhost_dequeue_burst.
> >>>
> >>> Note that the old patch does a special handling for skipping virtio
> >>> header. However, that could be simply done by adjusting desc_avail
> >>> and desc_offset var:
> >>>
> >>> desc_avail = desc->len - vq->vhost_hlen;
> >>> desc_offset = vq->vhost_hlen;
> >>>
> >>> This refactor makes the code much more readable (IMO), yet it reduces
> >>> binary code size (nearly 2K).
> >>>
> >>> Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
> >>> ---
> >>>
> >>> v2: - fix potential NULL dereference bug of var "prev" and "head"
> >>> ---
> >>> lib/librte_vhost/vhost_rxtx.c | 297 +++++++++++++++++-------------------------
> >>> 1 file changed, 116 insertions(+), 181 deletions(-)
> >>>
> >>> diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
> >>> index 5e7e5b1..d5cd0fa 100644
> >>> --- a/lib/librte_vhost/vhost_rxtx.c
> >>> +++ b/lib/librte_vhost/vhost_rxtx.c
> >>> @@ -702,21 +702,104 @@ vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
> >>> }
> >>> }
> >>>
> >>> +static inline struct rte_mbuf *
> >>> +copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
> >>> + uint16_t desc_idx, struct rte_mempool *mbuf_pool)
> >>> +{
> >>> + struct vring_desc *desc;
> >>> + uint64_t desc_addr;
> >>> + uint32_t desc_avail, desc_offset;
> >>> + uint32_t mbuf_avail, mbuf_offset;
> >>> + uint32_t cpy_len;
> >>> + struct rte_mbuf *head = NULL;
> >>> + struct rte_mbuf *cur = NULL, *prev = NULL;
> >>> + struct virtio_net_hdr *hdr;
> >>> +
> >>> + desc = &vq->desc[desc_idx];
> >>> + desc_addr = gpa_to_vva(dev, desc->addr);
> >>> + rte_prefetch0((void *)(uintptr_t)desc_addr);
> >>> +
> >>> + /* Retrieve virtio net header */
> >>> + hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
> >>> + desc_avail = desc->len - vq->vhost_hlen;
> >> There is a serious bug here, desc->len - vq->vhost_len could overflow.
> >> VM could easily create this case. Let us fix it here.
> > Nope, this issue has been there since the beginning, and this patch
> > is a refactor: we should not bring any functional changes. Therefore,
> > we should not fix it here.
>
> No, I don't mean exactly fixing in this patch but in this series.
>
> Besides, from refactoring's perspective, actually we could make things
> further much simpler and more readable. Both the desc chains and mbuf
> could be converted into iovec, then both dequeue(copy_desc_to_mbuf) and
> enqueue(copy_mbuf_to_desc) could use the commonly used iovec copying
> algorithms As data path are performance oriented, let us stop here.
Agreed. I had the same performance concern about further simplification,
therefore I didn't go further.
> >
> > And actually, it's been fixed in the 6th patch in this series:
>
> Will check that.
Do you have any other comments on the other patches? I'm considering
sending a new version soon, say maybe tomorrow.
--yliu
On Mon, Mar 07, 2016 at 02:32:46AM +0000, Xie, Huawei wrote:
> On 3/4/2016 10:15 AM, Yuanhan Liu wrote:
> > On Thu, Mar 03, 2016 at 04:30:42PM +0000, Xie, Huawei wrote:
> >> On 2/18/2016 9:48 PM, Yuanhan Liu wrote:
> >>> + mbuf_avail = 0;
> >>> + mbuf_offset = 0;
> >>> + while (desc_avail || (desc->flags & VRING_DESC_F_NEXT) != 0) {
> >>> + /* This desc reachs to its end, get the next one */
> >>> + if (desc_avail == 0) {
> >>> + desc = &vq->desc[desc->next];
> >>> +
> >>> + desc_addr = gpa_to_vva(dev, desc->addr);
> >>> + rte_prefetch0((void *)(uintptr_t)desc_addr);
> >>> +
> >>> + desc_offset = 0;
> >>> + desc_avail = desc->len;
> >>> +
> >>> + PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
> >>> + }
> >>> +
> >>> + /*
> >>> + * This mbuf reachs to its end, get a new one
> >>> + * to hold more data.
> >>> + */
> >>> + if (mbuf_avail == 0) {
> >>> + cur = rte_pktmbuf_alloc(mbuf_pool);
> >>> + if (unlikely(!cur)) {
> >>> + RTE_LOG(ERR, VHOST_DATA, "Failed to "
> >>> + "allocate memory for mbuf.\n");
> >>> + if (head)
> >>> + rte_pktmbuf_free(head);
> >>> + return NULL;
> >>> + }
> >> We could always allocate the head mbuf before the loop, then we save the
> >> following branch and make the code more streamlined.
> >> It reminds me that this change prevents the possibility of mbuf bulk
> >> allocation, one solution is we pass the head mbuf from an additional
> >> parameter.
> > Yep, that's also something I have thought of.
> >
> >> Btw, put unlikely before the check of mbuf_avail and checks elsewhere.
> > I don't think so. It would benifit for the small packets. What if,
> > however, when TSO or jumbo frame is enabled that we have big packets?
>
> Prefer to favor the path that packet could fit in one mbuf.
Hmmm, why? While I know that TSO and mergeable buf are disabled by default
in our vhost example vhost-switch, they are enabled by default in real
life.
> Btw, not specially to this, return "m = copy_desc_to_mbuf(dev, vq,
> desc_indexes[i], mbuf_pool)", failure case is unlikely to happen, so add
> unlikely for the check if (m == NULL) there. Please check all branches
> elsewhere.
Thanks for the reminder, will have a detailed check.
--yliu
On 3/4/2016 10:10 AM, Yuanhan Liu wrote:
> On Thu, Mar 03, 2016 at 05:19:42PM +0000, Xie, Huawei wrote:
>> On 2/18/2016 9:48 PM, Yuanhan Liu wrote:
>>> [...]
>> CCed changchun, the author for the chained handling of desc and mbuf.
>> The change makes the code more readable, but i think the following
>> commit message is simple and enough.
> Hmm.., my commit log tells a full story:
>
> - What is the issue? (messy/logic twisted code)
>
> - What the code does? (And what are the challenges: few tricky places)
>
> - What's the proposed solution to fix it. (the below pseudo code)
>
> And you suggest me to get rid of the first 2 items and leave 3rd item
> (a solution) only?
The following is simple and sufficient, with just one additional statement
about the repeated mbuf allocation or the twisted logic. The other commit
messages are overly duplicated. Just my personal opinion; up to you.
For this particular case, for example, we could turn both the mbuf and
vring_desc chains into iovecs, then use commonly used iovec copy
algorithms for both dequeue and enqueue, which would further make the code
much simpler and more readable. For this change, one or two sentences are
clear enough to me.
> --yliu
>
>>> while (this_desc_is_not_drained_totally || has_next_desc) {
>>> if (this_desc_has_drained_totally) {
>>> this_desc = next_desc();
>>> }
>>>
>>> if (mbuf_has_no_room) {
>>> mbuf = allocate_a_new_mbuf();
>>> }
>>>
>>> COPY(mbuf, desc);
>>> }
>>>
>>> [...]
>>>
>>> This refactor makes the code much more readable (IMO), yet it reduces
>>> binary code size (nearly 2K).
>> I guess the reduced binary code size comes from reduced inline calls to
>> mbuf allocation.
>>
On 3/7/2016 10:47 AM, Yuanhan Liu wrote:
> On Mon, Mar 07, 2016 at 02:32:46AM +0000, Xie, Huawei wrote:
>> On 3/4/2016 10:15 AM, Yuanhan Liu wrote:
>>> On Thu, Mar 03, 2016 at 04:30:42PM +0000, Xie, Huawei wrote:
>>>> On 2/18/2016 9:48 PM, Yuanhan Liu wrote:
>>>>> + mbuf_avail = 0;
>>>>> + mbuf_offset = 0;
>>>>> + while (desc_avail || (desc->flags & VRING_DESC_F_NEXT) != 0) {
>>>>> + /* This desc reachs to its end, get the next one */
>>>>> + if (desc_avail == 0) {
>>>>> + desc = &vq->desc[desc->next];
>>>>> +
>>>>> + desc_addr = gpa_to_vva(dev, desc->addr);
>>>>> + rte_prefetch0((void *)(uintptr_t)desc_addr);
>>>>> +
>>>>> + desc_offset = 0;
>>>>> + desc_avail = desc->len;
>>>>> +
>>>>> + PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
>>>>> + }
>>>>> +
>>>>> + /*
>>>>> + * This mbuf reachs to its end, get a new one
>>>>> + * to hold more data.
>>>>> + */
>>>>> + if (mbuf_avail == 0) {
>>>>> + cur = rte_pktmbuf_alloc(mbuf_pool);
>>>>> + if (unlikely(!cur)) {
>>>>> + RTE_LOG(ERR, VHOST_DATA, "Failed to "
>>>>> + "allocate memory for mbuf.\n");
>>>>> + if (head)
>>>>> + rte_pktmbuf_free(head);
>>>>> + return NULL;
>>>>> + }
>>>> We could always allocate the head mbuf before the loop, then we save the
>>>> following branch and make the code more streamlined.
>>>> It reminds me that this change prevents the possibility of mbuf bulk
>>>> allocation, one solution is we pass the head mbuf from an additional
>>>> parameter.
>>> Yep, that's also something I have thought of.
>>>
>>>> Btw, put unlikely before the check of mbuf_avail and checks elsewhere.
>>> I don't think so. It would benifit for the small packets. What if,
>>> however, when TSO or jumbo frame is enabled that we have big packets?
>> Prefer to favor the path that packet could fit in one mbuf.
> Hmmm, why? While I know that TSO and mergeable buf is disable by default
> in our vhost example vhost-switch, they are enabled by default in real
> life.
Mergeable is only meaningful in the RX path.
If you mean big packets in the TX path, I personally favor the path where
the packet fits in one mbuf.
>> Btw, not specially to this, return "m = copy_desc_to_mbuf(dev, vq,
>> desc_indexes[i], mbuf_pool)", failure case is unlikely to happen, so add
>> unlikely for the check if (m == NULL) there. Please check all branches
>> elsewhere.
> Thanks for the remind, will have a detail check.
>
> --yliu
>
On 3/4/2016 10:30 AM, Yuanhan Liu wrote:
> On Thu, Mar 03, 2016 at 05:40:14PM +0000, Xie, Huawei wrote:
>> On 2/18/2016 9:48 PM, Yuanhan Liu wrote:
>>> The current rte_vhost_dequeue_burst() implementation is a bit messy
>> [...]
>>> +
>>> uint16_t
>>> rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
>>> struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
>>> {
>>> - struct rte_mbuf *m, *prev;
>>> struct vhost_virtqueue *vq;
>>> - struct vring_desc *desc;
>>> - uint64_t vb_addr = 0;
>>> - uint64_t vb_net_hdr_addr = 0;
>>> - uint32_t head[MAX_PKT_BURST];
>>> + uint32_t desc_indexes[MAX_PKT_BURST];
>> indices
> http://dictionary.reference.com/browse/index
>
> index
> noun, plural indexes, indices
OK, I see both are used.
>>
>>> uint32_t used_idx;
>>> uint32_t i;
>>> - uint16_t free_entries, entry_success = 0;
>>> + uint16_t free_entries;
>>> uint16_t avail_idx;
>>> - struct virtio_net_hdr *hdr = NULL;
>>> + struct rte_mbuf *m;
>>>
>>> if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) {
>>> RTE_LOG(ERR, VHOST_DATA,
>>> @@ -730,197 +813,49 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
>>> return 0;
>>>
>>> - if (entry_success < (free_entries - 1)) {
>>> - /* Prefetch descriptor index. */
>>> - rte_prefetch0(&vq->desc[head[entry_success+1]]);
>>> - rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
>>> - }
>> Why is this prefetch silently dropped in the patch?
> Oops, good catching. Will fix it. Thanks.
>
>
>>> break;
>>> + pkts[i] = m;
>>>
>>> - m->nb_segs = seg_num;
>>> - if ((hdr->flags != 0) || (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE))
>>> - vhost_dequeue_offload(hdr, m);
>>> -
>>> - pkts[entry_success] = m;
>>> - vq->last_used_idx++;
>>> - entry_success++;
>>> + used_idx = vq->last_used_idx++ & (vq->size - 1);
>>> + vq->used->ring[used_idx].id = desc_indexes[i];
>>> + vq->used->ring[used_idx].len = 0;
>> What is the correct value for ring[used_idx].len, the packet length or 0?
> Good question. I didn't notice that before. Sounds buggy to me. However,
> that's from the old code. Will check it.
Yes, I knew it is in the old code also. Thanks.
> --yliu
>
On 2/18/2016 9:48 PM, Yuanhan Liu wrote:
> + mbuf_avail = 0;
> + mbuf_offset = 0;
One coding-style nit: initialize these at their definition.
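That is, something like this at the top of copy_desc_to_mbuf():

    uint32_t mbuf_avail  = 0;
    uint32_t mbuf_offset = 0;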
On Mon, Mar 07, 2016 at 02:59:55AM +0000, Xie, Huawei wrote:
> On 3/7/2016 10:47 AM, Yuanhan Liu wrote:
> > On Mon, Mar 07, 2016 at 02:32:46AM +0000, Xie, Huawei wrote:
> >> On 3/4/2016 10:15 AM, Yuanhan Liu wrote:
> >>> On Thu, Mar 03, 2016 at 04:30:42PM +0000, Xie, Huawei wrote:
> >>>> On 2/18/2016 9:48 PM, Yuanhan Liu wrote:
> >>>> We could always allocate the head mbuf before the loop, then we save the
> >>>> following branch and make the code more streamlined.
> >>>> It reminds me that this change prevents the possibility of mbuf bulk
> >>>> allocation, one solution is we pass the head mbuf from an additional
> >>>> parameter.
> >>> Yep, that's also something I have thought of.
> >>>
> >>>> Btw, put unlikely before the check of mbuf_avail and checks elsewhere.
> >>> I don't think so. It would benifit for the small packets. What if,
> >>> however, when TSO or jumbo frame is enabled that we have big packets?
> >> Prefer to favor the path that packet could fit in one mbuf.
> > Hmmm, why? While I know that TSO and mergeable buf is disable by default
> > in our vhost example vhost-switch, they are enabled by default in real
> > life.
>
> mergable is only meaningful in RX path.
> If you mean big packets in TX path,
Sorry, and yes, I meant that.
> i personally favor the path that
> packet fits in one mbuf.
Sorry, that will not convince me.
--yliu
@@ -702,21 +702,104 @@ vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
}
}
+static inline struct rte_mbuf *
+copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint16_t desc_idx, struct rte_mempool *mbuf_pool)
+{
+ struct vring_desc *desc;
+ uint64_t desc_addr;
+ uint32_t desc_avail, desc_offset;
+ uint32_t mbuf_avail, mbuf_offset;
+ uint32_t cpy_len;
+ struct rte_mbuf *head = NULL;
+ struct rte_mbuf *cur = NULL, *prev = NULL;
+ struct virtio_net_hdr *hdr;
+
+ desc = &vq->desc[desc_idx];
+ desc_addr = gpa_to_vva(dev, desc->addr);
+ rte_prefetch0((void *)(uintptr_t)desc_addr);
+
+ /* Retrieve virtio net header */
+ hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
+ desc_avail = desc->len - vq->vhost_hlen;
+ desc_offset = vq->vhost_hlen;
+
+ mbuf_avail = 0;
+ mbuf_offset = 0;
+ while (desc_avail || (desc->flags & VRING_DESC_F_NEXT) != 0) {
+ /* This desc reachs to its end, get the next one */
+ if (desc_avail == 0) {
+ desc = &vq->desc[desc->next];
+
+ desc_addr = gpa_to_vva(dev, desc->addr);
+ rte_prefetch0((void *)(uintptr_t)desc_addr);
+
+ desc_offset = 0;
+ desc_avail = desc->len;
+
+ PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
+ }
+
+ /*
+ * This mbuf reachs to its end, get a new one
+ * to hold more data.
+ */
+ if (mbuf_avail == 0) {
+ cur = rte_pktmbuf_alloc(mbuf_pool);
+ if (unlikely(!cur)) {
+ RTE_LOG(ERR, VHOST_DATA, "Failed to "
+ "allocate memory for mbuf.\n");
+ if (head)
+ rte_pktmbuf_free(head);
+ return NULL;
+ }
+ if (!head) {
+ head = cur;
+ } else {
+ prev->next = cur;
+ prev->data_len = mbuf_offset;
+ head->nb_segs += 1;
+ }
+ head->pkt_len += mbuf_offset;
+ prev = cur;
+
+ mbuf_offset = 0;
+ mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
+ }
+
+ cpy_len = RTE_MIN(desc_avail, mbuf_avail);
+ rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, mbuf_offset),
+ (void *)((uintptr_t)(desc_addr + desc_offset)),
+ cpy_len);
+
+ mbuf_avail -= cpy_len;
+ mbuf_offset += cpy_len;
+ desc_avail -= cpy_len;
+ desc_offset += cpy_len;
+ }
+
+ if (prev) {
+ prev->data_len = mbuf_offset;
+ head->pkt_len += mbuf_offset;
+
+ if (hdr->flags != 0 || hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE)
+ vhost_dequeue_offload(hdr, head);
+ }
+
+ return head;
+}
+
uint16_t
rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
{
- struct rte_mbuf *m, *prev;
struct vhost_virtqueue *vq;
- struct vring_desc *desc;
- uint64_t vb_addr = 0;
- uint64_t vb_net_hdr_addr = 0;
- uint32_t head[MAX_PKT_BURST];
+ uint32_t desc_indexes[MAX_PKT_BURST];
uint32_t used_idx;
uint32_t i;
- uint16_t free_entries, entry_success = 0;
+ uint16_t free_entries;
uint16_t avail_idx;
- struct virtio_net_hdr *hdr = NULL;
+ struct rte_mbuf *m;
if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) {
RTE_LOG(ERR, VHOST_DATA,
@@ -730,197 +813,49 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
return 0;
avail_idx = *((volatile uint16_t *)&vq->avail->idx);
-
- /* If there are no available buffers then return. */
- if (vq->last_used_idx == avail_idx)
+ free_entries = avail_idx - vq->last_used_idx;
+ if (free_entries == 0)
return 0;
- LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__,
- dev->device_fh);
+ LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__, dev->device_fh);
- /* Prefetch available ring to retrieve head indexes. */
- rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);
+ used_idx = vq->last_used_idx & (vq->size -1);
- /*get the number of free entries in the ring*/
- free_entries = (avail_idx - vq->last_used_idx);
+ /* Prefetch available ring to retrieve head indexes. */
+ rte_prefetch0(&vq->avail->ring[used_idx]);
- free_entries = RTE_MIN(free_entries, count);
- /* Limit to MAX_PKT_BURST. */
- free_entries = RTE_MIN(free_entries, MAX_PKT_BURST);
+ count = RTE_MIN(count, MAX_PKT_BURST);
+ count = RTE_MIN(count, free_entries);
+ LOG_DEBUG(VHOST_DATA, "(%"PRIu64") about to dequeue %u buffers\n",
+ dev->device_fh, count);
- LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
- dev->device_fh, free_entries);
/* Retrieve all of the head indexes first to avoid caching issues. */
- for (i = 0; i < free_entries; i++)
- head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];
+ for (i = 0; i < count; i++) {
+ desc_indexes[i] = vq->avail->ring[(vq->last_used_idx + i) &
+ (vq->size - 1)];
+ }
/* Prefetch descriptor index. */
- rte_prefetch0(&vq->desc[head[entry_success]]);
+ rte_prefetch0(&vq->desc[desc_indexes[0]]);
rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
- while (entry_success < free_entries) {
- uint32_t vb_avail, vb_offset;
- uint32_t seg_avail, seg_offset;
- uint32_t cpy_len;
- uint32_t seg_num = 0;
- struct rte_mbuf *cur;
- uint8_t alloc_err = 0;
-
- desc = &vq->desc[head[entry_success]];
-
- vb_net_hdr_addr = gpa_to_vva(dev, desc->addr);
- hdr = (struct virtio_net_hdr *)((uintptr_t)vb_net_hdr_addr);
-
- /* Discard first buffer as it is the virtio header */
- if (desc->flags & VRING_DESC_F_NEXT) {
- desc = &vq->desc[desc->next];
- vb_offset = 0;
- vb_avail = desc->len;
- } else {
- vb_offset = vq->vhost_hlen;
- vb_avail = desc->len - vb_offset;
- }
-
- /* Buffer address translation. */
- vb_addr = gpa_to_vva(dev, desc->addr);
- /* Prefetch buffer address. */
- rte_prefetch0((void *)(uintptr_t)vb_addr);
-
- used_idx = vq->last_used_idx & (vq->size - 1);
-
- if (entry_success < (free_entries - 1)) {
- /* Prefetch descriptor index. */
- rte_prefetch0(&vq->desc[head[entry_success+1]]);
- rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
- }
-
- /* Update used index buffer information. */
- vq->used->ring[used_idx].id = head[entry_success];
- vq->used->ring[used_idx].len = 0;
-
- /* Allocate an mbuf and populate the structure. */
- m = rte_pktmbuf_alloc(mbuf_pool);
- if (unlikely(m == NULL)) {
- RTE_LOG(ERR, VHOST_DATA,
- "Failed to allocate memory for mbuf.\n");
- break;
- }
- seg_offset = 0;
- seg_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
- cpy_len = RTE_MIN(vb_avail, seg_avail);
-
- PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0);
-
- seg_num++;
- cur = m;
- prev = m;
- while (cpy_len != 0) {
- rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, seg_offset),
- (void *)((uintptr_t)(vb_addr + vb_offset)),
- cpy_len);
-
- seg_offset += cpy_len;
- vb_offset += cpy_len;
- vb_avail -= cpy_len;
- seg_avail -= cpy_len;
-
- if (vb_avail != 0) {
- /*
- * The segment reachs to its end,
- * while the virtio buffer in TX vring has
- * more data to be copied.
- */
- cur->data_len = seg_offset;
- m->pkt_len += seg_offset;
- /* Allocate mbuf and populate the structure. */
- cur = rte_pktmbuf_alloc(mbuf_pool);
- if (unlikely(cur == NULL)) {
- RTE_LOG(ERR, VHOST_DATA, "Failed to "
- "allocate memory for mbuf.\n");
- rte_pktmbuf_free(m);
- alloc_err = 1;
- break;
- }
-
- seg_num++;
- prev->next = cur;
- prev = cur;
- seg_offset = 0;
- seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
- } else {
- if (desc->flags & VRING_DESC_F_NEXT) {
- /*
- * There are more virtio buffers in
- * same vring entry need to be copied.
- */
- if (seg_avail == 0) {
- /*
- * The current segment hasn't
- * room to accomodate more
- * data.
- */
- cur->data_len = seg_offset;
- m->pkt_len += seg_offset;
- /*
- * Allocate an mbuf and
- * populate the structure.
- */
- cur = rte_pktmbuf_alloc(mbuf_pool);
- if (unlikely(cur == NULL)) {
- RTE_LOG(ERR,
- VHOST_DATA,
- "Failed to "
- "allocate memory "
- "for mbuf\n");
- rte_pktmbuf_free(m);
- alloc_err = 1;
- break;
- }
- seg_num++;
- prev->next = cur;
- prev = cur;
- seg_offset = 0;
- seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
- }
-
- desc = &vq->desc[desc->next];
-
- /* Buffer address translation. */
- vb_addr = gpa_to_vva(dev, desc->addr);
- /* Prefetch buffer address. */
- rte_prefetch0((void *)(uintptr_t)vb_addr);
- vb_offset = 0;
- vb_avail = desc->len;
-
- PRINT_PACKET(dev, (uintptr_t)vb_addr,
- desc->len, 0);
- } else {
- /* The whole packet completes. */
- cur->data_len = seg_offset;
- m->pkt_len += seg_offset;
- vb_avail = 0;
- }
- }
-
- cpy_len = RTE_MIN(vb_avail, seg_avail);
- }
-
- if (unlikely(alloc_err == 1))
+ for (i = 0; i < count; i++) {
+ m = copy_desc_to_mbuf(dev, vq, desc_indexes[i], mbuf_pool);
+ if (m == NULL)
break;
+ pkts[i] = m;
- m->nb_segs = seg_num;
- if ((hdr->flags != 0) || (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE))
- vhost_dequeue_offload(hdr, m);
-
- pkts[entry_success] = m;
- vq->last_used_idx++;
- entry_success++;
+ used_idx = vq->last_used_idx++ & (vq->size - 1);
+ vq->used->ring[used_idx].id = desc_indexes[i];
+ vq->used->ring[used_idx].len = 0;
}
rte_compiler_barrier();
- vq->used->idx += entry_success;
+ vq->used->idx += i;
+
/* Kick guest if required. */
if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
eventfd_write(vq->callfd, (eventfd_t)1);
- return entry_success;
+
+ return i;
}