[v3] vhost: fix async copy fail on multi-page buffers
Commit Message
From: Patrick Fu <patrick.fu@intel.com>
Async copy fails when a single ring buffer vector is split across
multiple physical pages. This happens because the current hpa address
translation function doesn't handle multi-page buffers. This patch
implements a new gpa-to-hpa address conversion function, which returns
the hpa of the first host page hit. The async data path repeatedly
calls this new function to construct a multi-segment async copy
descriptor for ring buffers that cross physical page boundaries.
Fixes: cd6760da1076 ("vhost: introduce async enqueue for split ring")
Signed-off-by: Patrick Fu <patrick.fu@intel.com>
---
v2:
- change commit message and title
- v1 patch used CPU to copy multi-page buffers; v2 patch split the
copy into multiple async copy segments whenever possible
v3:
- added fixline
lib/librte_vhost/vhost.h | 50 +++++++++++++++++++++++++++++++++++
lib/librte_vhost/virtio_net.c | 40 +++++++++++++++++-----------
2 files changed, 75 insertions(+), 15 deletions(-)
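Before the review thread, a minimal standalone sketch of the splitting idea may help (this is not the DPDK code: the page_map table, the first_hpa() helper and all constants are hypothetical stand-ins for dev->guest_pages and gpa_to_first_hpa()). A copy that crosses a guest page boundary is broken into one segment per contiguous host mapping:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

struct page_map { uint64_t gpa, hpa, size; };

/* Two 4 KiB guest pages that are NOT contiguous in host memory. */
static const struct page_map pages[] = {
	{ 0x0000, 0x70000, 0x1000 },
	{ 0x1000, 0x90000, 0x1000 },
};

/* Return the hpa of the first page hit; *seg_len receives the length of
 * the contiguous host mapping starting at gpa (0 on a miss). */
static uint64_t first_hpa(uint64_t gpa, uint64_t len, uint64_t *seg_len)
{
	unsigned int i;

	for (i = 0; i < 2; i++) {
		const struct page_map *p = &pages[i];

		if (gpa >= p->gpa && gpa < p->gpa + p->size) {
			uint64_t room = p->gpa + p->size - gpa;

			*seg_len = len <= room ? len : room;
			return p->hpa + (gpa - p->gpa);
		}
	}
	*seg_len = 0;
	return 0;
}

int main(void)
{
	uint64_t gpa = 0x0800, len = 0x1000;	/* crosses the page boundary */
	uint64_t hpa, seg;

	while (len) {
		hpa = first_hpa(gpa, len, &seg);
		if (!hpa || !seg)
			break;	/* unmapped: a real caller would bail out */
		/* Prints two segments: 0x800 bytes at 0x70800, then
		 * 0x800 bytes at 0x90000. */
		printf("segment: hpa=0x%" PRIx64 " len=0x%" PRIx64 "\n",
			hpa, seg);
		gpa += seg;
		len -= seg;
	}
	return 0;
}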
Comments
Hi Patrick,
> -----Original Message-----
> From: Fu, Patrick <patrick.fu@intel.com>
> Sent: Monday, July 27, 2020 2:33 PM
> To: dev@dpdk.org; maxime.coquelin@redhat.com; Xia, Chenbo
> <chenbo.xia@intel.com>
> Cc: Fu, Patrick <patrick.fu@intel.com>
> Subject: [PATCH v3] vhost: fix async copy fail on multi-page buffers
>
> From: Patrick Fu <patrick.fu@intel.com>
>
> Async copy fails when a single ring buffer vector is split across multiple
> physical pages. This happens because the current hpa address translation
> function doesn't handle multi-page buffers. This patch implements a new
> gpa-to-hpa address conversion function, which returns the hpa of the first
> host page hit. The async data path repeatedly calls this new function to
> construct a multi-segment async copy descriptor for ring buffers that cross
> physical page boundaries.
>
> Fixes: cd6760da1076 ("vhost: introduce async enqueue for split ring")
>
> Signed-off-by: Patrick Fu <patrick.fu@intel.com>
> ---
> v2:
> - change commit message and title
> - v1 patch used CPU to copy multi-page buffers; v2 patch split the copy into
> multiple async copy segments whenever possible
>
> v3:
> - added fixline
>
> lib/librte_vhost/vhost.h | 50 +++++++++++++++++++++++++++++++++++
> lib/librte_vhost/virtio_net.c | 40 +++++++++++++++++-----------
> 2 files changed, 75 insertions(+), 15 deletions(-)
>
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index 0f7212f88..05c202a57 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -616,6 +616,56 @@ gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size)
> return 0;
> }
>
> +static __rte_always_inline rte_iova_t
> +gpa_to_first_hpa(struct virtio_net *dev, uint64_t gpa,
> + uint64_t gpa_size, uint64_t *hpa_size)
> +{
> + uint32_t i;
> + struct guest_page *page;
> + struct guest_page key;
> +
> + *hpa_size = gpa_size;
> + if (dev->nr_guest_pages >= VHOST_BINARY_SEARCH_THRESH) {
> + key.guest_phys_addr = gpa & ~(dev->guest_pages[0].size - 1);
> + page = bsearch(&key, dev->guest_pages, dev->nr_guest_pages,
> + sizeof(struct guest_page), guest_page_addrcmp);
> + if (page) {
> + if (gpa + gpa_size <=
> + page->guest_phys_addr + page->size) {
> + return gpa - page->guest_phys_addr +
> + page->host_phys_addr;
> + } else if (gpa < page->guest_phys_addr +
> + page->size) {
> + *hpa_size = page->guest_phys_addr +
> + page->size - gpa;
> + return gpa - page->guest_phys_addr +
> + page->host_phys_addr;
> + }
> + }
> + } else {
> + for (i = 0; i < dev->nr_guest_pages; i++) {
> + page = &dev->guest_pages[i];
> +
> + if (gpa >= page->guest_phys_addr) {
> + if (gpa + gpa_size <
Should the '<' be '<=' here?
> + page->guest_phys_addr + page->size) {
> + return gpa - page->guest_phys_addr +
> + page->host_phys_addr;
> + } else if (gpa < page->guest_phys_addr +
> + page->size) {
> + *hpa_size = page->guest_phys_addr +
> + page->size - gpa;
> + return gpa - page->guest_phys_addr +
> + page->host_phys_addr;
> + }
> + }
> + }
> + }
> +
> + *hpa_size = 0;
> + return 0;
> +}
> +
> static __rte_always_inline uint64_t
> hva_to_gpa(struct virtio_net *dev, uint64_t vva, uint64_t len)
> {
>
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 95a0bc19f..124a33a10 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -980,6 +980,7 @@ async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
> struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
> struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
> int error = 0;
> + uint64_t mapped_len;
>
> uint32_t tlen = 0;
> int tvec_idx = 0;
> @@ -1072,24 +1073,31 @@ async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
>
> cpy_len = RTE_MIN(buf_avail, mbuf_avail);
>
> - if (unlikely(cpy_len >= cpy_threshold)) {
> - hpa = (void *)(uintptr_t)gpa_to_hpa(dev,
> - buf_iova + buf_offset, cpy_len);
> + while (unlikely(cpy_len && cpy_len >= cpy_threshold)) {
> + hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
> + buf_iova + buf_offset,
> + cpy_len, &mapped_len);
>
> - if (unlikely(!hpa)) {
> - error = -1;
> - goto out;
> - }
> + if (unlikely(!hpa || mapped_len < cpy_threshold))
> + break;
>
> async_fill_vec(src_iovec + tvec_idx,
> (void *)(uintptr_t)rte_pktmbuf_iova_offset(m,
> - mbuf_offset), cpy_len);
> + mbuf_offset), (size_t)mapped_len);
>
> - async_fill_vec(dst_iovec + tvec_idx, hpa, cpy_len);
> + async_fill_vec(dst_iovec + tvec_idx,
> + hpa, (size_t)mapped_len);
>
> - tlen += cpy_len;
> + tlen += (uint32_t)mapped_len;
> + cpy_len -= (uint32_t)mapped_len;
> + mbuf_avail -= (uint32_t)mapped_len;
> + mbuf_offset += (uint32_t)mapped_len;
> + buf_avail -= (uint32_t)mapped_len;
> + buf_offset += (uint32_t)mapped_len;
Is it OK to just cast the uint64_t down to uint32_t here?
What if mapped_len > UINT32_MAX?
Thanks!
Chenbo
> tvec_idx++;
> - } else {
> + }
> +
> + if (likely(cpy_len)) {
> if (unlikely(vq->batch_copy_nb_elems >= vq->size)) {
> rte_memcpy(
> (void *)((uintptr_t)(buf_addr + buf_offset)),
> @@ -1112,10 +1120,12 @@ async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
> }
> }
>
> - mbuf_avail -= cpy_len;
> - mbuf_offset += cpy_len;
> - buf_avail -= cpy_len;
> - buf_offset += cpy_len;
> + if (cpy_len) {
> + mbuf_avail -= cpy_len;
> + mbuf_offset += cpy_len;
> + buf_avail -= cpy_len;
> + buf_offset += cpy_len;
> + }
> }
>
> out:
> --
> 2.18.4
Hi,
> -----Original Message-----
> From: Xia, Chenbo <chenbo.xia@intel.com>
> Sent: Monday, July 27, 2020 9:14 PM
> To: Fu, Patrick <patrick.fu@intel.com>; dev@dpdk.org;
> maxime.coquelin@redhat.com
> Subject: RE: [PATCH v3] vhost: fix async copy fail on multi-page buffers
>
> Hi Patrick,
>
> > -----Original Message-----
> > From: Fu, Patrick <patrick.fu@intel.com>
> > Sent: Monday, July 27, 2020 2:33 PM
> > To: dev@dpdk.org; maxime.coquelin@redhat.com; Xia, Chenbo
> > <chenbo.xia@intel.com>
> > Cc: Fu, Patrick <patrick.fu@intel.com>
> > Subject: [PATCH v3] vhost: fix async copy fail on multi-page buffers
> >
> > lib/librte_vhost/vhost.h | 50 +++++++++++++++++++++++++++++++++++
> > lib/librte_vhost/virtio_net.c | 40 +++++++++++++++++-----------
> > 2 files changed, 75 insertions(+), 15 deletions(-)
> >
> > + } else {
> > + for (i = 0; i < dev->nr_guest_pages; i++) {
> > + page = &dev->guest_pages[i];
> > +
> > + if (gpa >= page->guest_phys_addr) {
> > + if (gpa + gpa_size <
>
> Should the '<' be '<=' here?
>
Yes, it should be. Will update in the v4 patch.
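For reference, the case '<=' covers is a buffer ending exactly at the page end: with '<' such a buffer falls through to the partial-page branch instead of being recognized as fully resident (which happens to produce the same segment here, but diverges from the bsearch path above). A tiny standalone check with hypothetical numbers, not the vhost code:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* Hypothetical 4 KiB page at guest-physical 0x1000. */
	uint64_t page_gpa = 0x1000, page_size = 0x1000;
	/* Buffer ending exactly at the page end: gpa + gpa_size == end. */
	uint64_t gpa = 0x1800, gpa_size = 0x800;

	/* '<' treats the exactly-fitting buffer as spilling over... */
	assert(!(gpa + gpa_size < page_gpa + page_size));
	/* ...while '<=' recognizes it as fully inside the page. */
	assert(gpa + gpa_size <= page_gpa + page_size);
	return 0;
}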
> > static __rte_always_inline uint64_t
> > hva_to_gpa(struct virtio_net *dev, uint64_t vva, uint64_t len)
> > {
> >
> > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> > index 95a0bc19f..124a33a10 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > @@ -980,6 +980,7 @@ async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
> > struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
> > int error = 0;
> > + uint64_t mapped_len;
> >
> > uint32_t tlen = 0;
> > int tvec_idx = 0;
> > @@ -1072,24 +1073,31 @@ async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
> >
> > cpy_len = RTE_MIN(buf_avail, mbuf_avail);
> >
> > - if (unlikely(cpy_len >= cpy_threshold)) {
> > - hpa = (void *)(uintptr_t)gpa_to_hpa(dev,
> > - buf_iova + buf_offset, cpy_len);
> > + while (unlikely(cpy_len && cpy_len >= cpy_threshold)) {
> > + hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
> > + buf_iova + buf_offset,
> > + cpy_len, &mapped_len);
> >
> > - if (unlikely(!hpa)) {
> > - error = -1;
> > - goto out;
> > - }
> > + if (unlikely(!hpa || mapped_len < cpy_threshold))
> > + break;
> >
> > async_fill_vec(src_iovec + tvec_idx,
> > (void *)(uintptr_t)rte_pktmbuf_iova_offset(m,
> > - mbuf_offset), cpy_len);
> > + mbuf_offset), (size_t)mapped_len);
> >
> > - async_fill_vec(dst_iovec + tvec_idx, hpa, cpy_len);
> > + async_fill_vec(dst_iovec + tvec_idx,
> > + hpa, (size_t)mapped_len);
> >
> > - tlen += cpy_len;
> > + tlen += (uint32_t)mapped_len;
> > + cpy_len -= (uint32_t)mapped_len;
> > + mbuf_avail -= (uint32_t)mapped_len;
> > + mbuf_offset += (uint32_t)mapped_len;
> > + buf_avail -= (uint32_t)mapped_len;
> > + buf_offset += (uint32_t)mapped_len;
>
> Is it OK to just cast the uint64_t down to uint32_t here?
> What if mapped_len > UINT32_MAX?
>
"mapped_len" could not exceed "cpy_len" (according to the gpa_to_first_hpa() logic). As cpy_len is uint32, mapped_len is safe to down casting.
Thanks,
Patrick
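The invariant behind that answer (the returned mapped_len never exceeds the requested size, because *hpa_size starts at gpa_size and is only ever reduced to the room left in the page, or zeroed on a miss) can be sanity-checked with a small standalone sketch; the single-page mapping below is hypothetical, not the vhost code:

#include <assert.h>
#include <stdint.h>

/* Hedged model of gpa_to_first_hpa()'s length handling: return the full
 * request, clip it to the end of the page, or zero it on a miss. */
static uint64_t map_first(uint64_t gpa, uint64_t gpa_size,
		uint64_t *mapped_len)
{
	const uint64_t page_gpa = 0x0, page_hpa = 0x70000, page_size = 0x1000;

	*mapped_len = gpa_size;
	if (gpa >= page_gpa && gpa < page_gpa + page_size) {
		uint64_t room = page_gpa + page_size - gpa;

		if (gpa_size > room)
			*mapped_len = room;	/* clipped to page end */
		return page_hpa + (gpa - page_gpa);
	}
	*mapped_len = 0;			/* miss */
	return 0;
}

int main(void)
{
	uint64_t gpa, sz, len;

	for (gpa = 0; gpa < 0x2000; gpa += 0x80)
		for (sz = 1; sz <= 0x2000; sz <<= 1) {
			map_first(gpa, sz, &len);
			/* mapped_len <= gpa_size always holds, so the
			 * downcast is safe whenever cpy_len itself fits
			 * in a uint32_t. */
			assert(len <= sz);
		}
	return 0;
}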