[v6,7/7] vhost: optimize memcpy routines when cc memcpy is used
Commit Message
In builds where use_cc_memcpy is set to true, the vhost user PMD
suffers a large performance drop on Intel P-cores for small packets,
at least when built with GCC and (to a much lesser extent) clang.
This patch addresses that issue by using a custom, memcpy()-based
packet copying routine in the virtio code.
Performance results from a Raptor Lake @ 3.2 GHz:

GCC 12.3.0, 64-byte packets

Core  Mode              Mpps
E     RTE memcpy         9.5
E     cc memcpy          9.7
E     cc memcpy+pktcpy   9.0

P     RTE memcpy        16.4
P     cc memcpy         13.5
P     cc memcpy+pktcpy  16.2

GCC 12.3.0, 1500-byte packets

Core  Mode              Mpps
P     RTE memcpy         5.8
P     cc memcpy          5.9
P     cc memcpy+pktcpy   5.9

clang 15.0.7, 64-byte packets

Core  Mode              Mpps
P     RTE memcpy        13.3
P     cc memcpy         12.9
P     cc memcpy+pktcpy  13.9

"RTE memcpy" is use_cc_memcpy=false, "cc memcpy" is use_cc_memcpy=true,
and "pktcpy" is "cc memcpy" with this patch applied.
Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
lib/vhost/virtio_net.c | 37 +++++++++++++++++++++++++++++++++++--
1 file changed, 35 insertions(+), 2 deletions(-)
Comments
On 9/20/24 12:27, Mattias Rönnblom wrote:
> In build where use_cc_memcpy is set to true, the vhost user PMD
> suffers a large performance drop on Intel P-cores for small packets,
> at least when built by GCC and (to a much lesser extent) clang.
>
> This patch addresses that issue by using a custom virtio
> memcpy()-based packet copying routine.
>
> Performance results from a Raptor Lake @ 3,2 GHz:
>
> GCC 12.3.0
> 64 bytes packets
> Core Mode Mpps
> E RTE memcpy 9.5
> E cc memcpy 9.7
> E cc memcpy+pktcpy 9.0
>
> P RTE memcpy 16.4
> P cc memcpy 13.5
> P cc memcpy+pktcpy 16.2
>
> GCC 12.3.0
> 1500 bytes packets
> Core Mode Mpps
> P RTE memcpy 5.8
> P cc memcpy 5.9
> P cc memcpy+pktcpy 5.9
>
> clang 15.0.7
> 64 bytes packets
> Core Mode Mpps
> P RTE memcpy 13.3
> P cc memcpy 12.9
> P cc memcpy+pktcpy 13.9
>
> "RTE memcpy" is use_cc_memcpy=false, "cc memcpy" is use_cc_memcpy=true
> and "pktcpy" is when this patch is applied.
>
> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> ---
> lib/vhost/virtio_net.c | 37 +++++++++++++++++++++++++++++++++++--
> 1 file changed, 35 insertions(+), 2 deletions(-)
>
As the default behaviour remains unchanged, this looks good to me:
Acked-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Thanks,
Maxime
> +#if defined(RTE_USE_CC_MEMCPY) && defined(RTE_ARCH_X86_64)
> +static __rte_always_inline void
> +pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
> +{
A comment describing why batch_copy_elem.dst and src point to 16 byte aligned data would be nice.
> + void *dst = __builtin_assume_aligned(in_dst, 16);
> + const void *src = __builtin_assume_aligned(in_src, 16);
> +
> + if (len <= 256) {
> + size_t left;
> +
> + for (left = len; left >= 32; left -= 32) {
> + memcpy(dst, src, 32);
> + dst = RTE_PTR_ADD(dst, 32);
> + src = RTE_PTR_ADD(src, 32);
> + }
> +
> + memcpy(dst, src, left);
> + } else
> + memcpy(dst, src, len);
> +}
> +#else
> +static __rte_always_inline void
> +pktcpy(void *dst, const void *src, size_t len)
> +{
> + rte_memcpy(dst, src, len);
> +}
> +#endif
> +
> static inline void
> do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue
> *vq)
> __rte_shared_locks_required(&vq->iotlb_lock)
> @@ -240,7 +273,7 @@ do_data_copy_enqueue(struct virtio_net *dev, struct
> vhost_virtqueue *vq)
> int i;
>
> for (i = 0; i < count; i++) {
> - rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
> + pktcpy(elem[i].dst, elem[i].src, elem[i].len);
> vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
> elem[i].len);
> PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
> @@ -257,7 +290,7 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq)
> int i;
>
> for (i = 0; i < count; i++)
> - rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
> + pktcpy(elem[i].dst, elem[i].src, elem[i].len);
>
> vq->batch_copy_nb_elems = 0;
> }
> --
> 2.43.0
Anyway,
Acked-by: Morten Brørup <mb@smartsharesystems.com>
On Fri, 20 Sep 2024 12:27:16 +0200
Mattias Rönnblom <mattias.ronnblom@ericsson.com> wrote:
> +#if defined(RTE_USE_CC_MEMCPY) && defined(RTE_ARCH_X86_64)
> +static __rte_always_inline void
> +pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
> +{
> + void *dst = __builtin_assume_aligned(in_dst, 16);
> + const void *src = __builtin_assume_aligned(in_src, 16);
Not sure the buffer is really aligned that way (src and dst can be
pointers into the mbuf at an offset, and the offset will be a multiple
of the buffer length), but x86 doesn't care.
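As a hedged illustration of that point (not from the patch; the helper name
below is made up, while rte_pktmbuf_mtod_offset() is the standard mbuf macro),
the copy pointers are typically formed from an mbuf's data address plus an
offset, so their alignment depends on both data_off and that offset:

#include <rte_mbuf.h>

/* Illustrative sketch only: a copy pointer into an mbuf is usually
 * buf_addr + data_off + offset, so 16-byte alignment of src/dst holds
 * only if both data_off and the packet offset preserve it.
 */
static inline void *
example_pkt_ptr(struct rte_mbuf *m, uint32_t offset)
{
	return rte_pktmbuf_mtod_offset(m, void *, offset);
}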
On 2024-10-09 23:25, Morten Brørup wrote:
>> +#if defined(RTE_USE_CC_MEMCPY) && defined(RTE_ARCH_X86_64)
>> +static __rte_always_inline void
>> +pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
>> +{
>
> A comment describing why batch_copy_elem.dst and src point to 16 byte aligned data would be nice.
>
Good point. As I think I mentioned at some point, I'm not sure they are.
From what I recall, having (or pretending) the data is 16-byte aligned
does give a noticeable performance increase on x86_64.
Is this something I should look into for 24.11, or is this patch set not
going to make it anyway?
>> + void *dst = __builtin_assume_aligned(in_dst, 16);
>> + const void *src = __builtin_assume_aligned(in_src, 16);
>> +
>> + if (len <= 256) {
>> + size_t left;
>> +
>> + for (left = len; left >= 32; left -= 32) {
>> + memcpy(dst, src, 32);
>> + dst = RTE_PTR_ADD(dst, 32);
>> + src = RTE_PTR_ADD(src, 32);
>> + }
>> +
>> + memcpy(dst, src, left);
>> + } else
>> + memcpy(dst, src, len);
>> +}
>> +#else
>> +static __rte_always_inline void
>> +pktcpy(void *dst, const void *src, size_t len)
>> +{
>> + rte_memcpy(dst, src, len);
>> +}
>> +#endif
>> +
>> static inline void
>> do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue
>> *vq)
>> __rte_shared_locks_required(&vq->iotlb_lock)
>> @@ -240,7 +273,7 @@ do_data_copy_enqueue(struct virtio_net *dev, struct
>> vhost_virtqueue *vq)
>> int i;
>>
>> for (i = 0; i < count; i++) {
>> - rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
>> + pktcpy(elem[i].dst, elem[i].src, elem[i].len);
>> vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
>> elem[i].len);
>> PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
>> @@ -257,7 +290,7 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq)
>> int i;
>>
>> for (i = 0; i < count; i++)
>> - rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
>> + pktcpy(elem[i].dst, elem[i].src, elem[i].len);
>>
>> vq->batch_copy_nb_elems = 0;
>> }
>> --
>> 2.43.0
>
> Anyway,
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
>
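On the open question above, of whether elem[i].dst and elem[i].src really are
16-byte aligned: a minimal, purely illustrative way to test that assumption in
debug builds could look like the sketch below. It is not part of the patch;
the helper name is made up, and RTE_ASSERT() (from rte_debug.h) is compiled
out unless RTE_ENABLE_ASSERT is defined.

#include <stdint.h>
#include <rte_debug.h>

/* Sketch: assert the 16-byte alignment pktcpy() is told to assume. */
static inline void
check_pktcpy_alignment(const void *dst, const void *src)
{
	RTE_ASSERT(((uintptr_t)dst & 0xf) == 0);
	RTE_ASSERT(((uintptr_t)src & 0xf) == 0);
}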
On 2024-10-09 23:57, Stephen Hemminger wrote:
> On Fri, 20 Sep 2024 12:27:16 +0200
> Mattias Rönnblom <mattias.ronnblom@ericsson.com> wrote:
>
>> +#if defined(RTE_USE_CC_MEMCPY) && defined(RTE_ARCH_X86_64)
>> +static __rte_always_inline void
>> +pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
>> +{
>> + void *dst = __builtin_assume_aligned(in_dst, 16);
>> + const void *src = __builtin_assume_aligned(in_src, 16);
>
> Not sure if buffer is really aligned that way but x86 doesn't care.
>
I think it might care, actually; that's why this makes a difference.
With 16-byte alignment assumed, the compiler may use MOVDQA; otherwise
it can't, and must use MOVDQU. Generally these things don't matter from
a performance point of view in my experience, but in this case it did
(in my benchmark, on my CPU, with my compiler, etc.).
> Since src and dst can be pointers into mbuf at an offset.
> The offset will be a multiple of the buffer len.
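To illustrate the MOVDQA/MOVDQU point above with a minimal sketch (not part of
the patch; the function name is made up): with the alignment hint the compiler
is allowed to lower a fixed 16-byte memcpy() into an aligned vector move, while
without the hint it must assume possibly unaligned pointers.

#include <string.h>

/* With the hint, GCC/clang may emit an aligned move (e.g. MOVDQA) for
 * the 16-byte copy; without __builtin_assume_aligned(), the compiler
 * has to fall back to unaligned accesses (e.g. MOVDQU).
 */
static void
copy16(void *dst_in, const void *src_in)
{
	void *dst = __builtin_assume_aligned(dst_in, 16);
	const void *src = __builtin_assume_aligned(src_in, 16);

	memcpy(dst, src, 16);
}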
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -231,6 +231,39 @@ vhost_async_dma_check_completed(struct virtio_net *dev, int16_t dma_id, uint16_t
 	return nr_copies;
 }
 
+/* The code generated by GCC (and to a lesser extent, clang) with just
+ * a straight memcpy() to copy packets is less than optimal on Intel
+ * P-cores, for small packets. Thus the need of this specialized
+ * memcpy() in builds where use_cc_memcpy is set to true.
+ */
+#if defined(RTE_USE_CC_MEMCPY) && defined(RTE_ARCH_X86_64)
+static __rte_always_inline void
+pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
+{
+	void *dst = __builtin_assume_aligned(in_dst, 16);
+	const void *src = __builtin_assume_aligned(in_src, 16);
+
+	if (len <= 256) {
+		size_t left;
+
+		for (left = len; left >= 32; left -= 32) {
+			memcpy(dst, src, 32);
+			dst = RTE_PTR_ADD(dst, 32);
+			src = RTE_PTR_ADD(src, 32);
+		}
+
+		memcpy(dst, src, left);
+	} else
+		memcpy(dst, src, len);
+}
+#else
+static __rte_always_inline void
+pktcpy(void *dst, const void *src, size_t len)
+{
+	rte_memcpy(dst, src, len);
+}
+#endif
+
 static inline void
 do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
 	__rte_shared_locks_required(&vq->iotlb_lock)
@@ -240,7 +273,7 @@ do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
 	int i;
 
 	for (i = 0; i < count; i++) {
-		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
+		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
 		vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
 					   elem[i].len);
 		PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
@@ -257,7 +290,7 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq)
 	int i;
 
 	for (i = 0; i < count; i++)
-		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
+		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
 
 	vq->batch_copy_nb_elems = 0;
 }
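For reference, a self-contained micro-benchmark sketch in the same spirit as
the numbers in the commit message (purely illustrative: the function name,
buffer sizes, iteration count and timing method are all made up here, and this
is not the benchmark the figures above were produced with):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

#define PKT_LEN 64
#define ITERATIONS 100000000L

/* The same small-copy loop pattern as pktcpy(), minus the DPDK helpers. */
static void
pktcpy_like(void *restrict in_dst, const void *restrict in_src, size_t len)
{
	void *dst = __builtin_assume_aligned(in_dst, 16);
	const void *src = __builtin_assume_aligned(in_src, 16);
	size_t left;

	for (left = len; left >= 32; left -= 32) {
		memcpy(dst, src, 32);
		dst = (uint8_t *)dst + 32;
		src = (const uint8_t *)src + 32;
	}
	memcpy(dst, src, left);
}

int
main(void)
{
	static uint8_t src[PKT_LEN] __attribute__((aligned(16)));
	static uint8_t dst[PKT_LEN] __attribute__((aligned(16)));
	struct timespec start, end;
	long i;

	clock_gettime(CLOCK_MONOTONIC, &start);
	for (i = 0; i < ITERATIONS; i++) {
		pktcpy_like(dst, src, PKT_LEN);
		/* Barrier so the copy is neither hoisted nor elided. */
		__asm__ __volatile__("" : : "r"(dst), "r"(src) : "memory");
	}
	clock_gettime(CLOCK_MONOTONIC, &end);

	printf("%.2f ns/copy\n",
	       ((double)(end.tv_sec - start.tv_sec) * 1e9 +
	        (double)(end.tv_nsec - start.tv_nsec)) / ITERATIONS);
	return 0;
}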