[v6,7/7] vhost: optimize memcpy routines when cc memcpy is used

Message ID 20240920102716.738940-8-mattias.ronnblom@ericsson.com (mailing list archive)
State New
Delegated to: Thomas Monjalon
Series: Optionally have rte_memcpy delegate to compiler memcpy

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/loongarch-compilation success Compilation OK
ci/loongarch-unit-testing success Unit Testing PASS
ci/Intel-compilation success Compilation OK
ci/intel-Testing success Testing PASS
ci/github-robot: build success github build: passed
ci/intel-Functional success Functional PASS
ci/iol-unit-amd64-testing success Testing PASS
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-compile-amd64-testing success Testing PASS
ci/iol-unit-arm64-testing success Testing PASS
ci/iol-compile-arm64-testing success Testing PASS
ci/iol-sample-apps-testing success Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-marvell-Functional success Functional Testing PASS

Commit Message

Mattias Rönnblom Sept. 20, 2024, 10:27 a.m. UTC
In builds where use_cc_memcpy is set to true, the vhost user PMD
suffers a large performance drop on Intel P-cores for small packets,
at least when built with GCC and (to a much lesser extent) clang.

This patch addresses that issue by using a custom virtio
memcpy()-based packet copying routine.

Performance results from a Raptor Lake @ 3.2 GHz:

GCC 12.3.0
64-byte packets
Core  Mode              Mpps
E     RTE memcpy        9.5
E     cc memcpy         9.7
E     cc memcpy+pktcpy  9.0

P     RTE memcpy        16.4
P     cc memcpy         13.5
P     cc memcpy+pktcpy  16.2

GCC 12.3.0
1500-byte packets
Core  Mode              Mpps
P     RTE memcpy        5.8
P     cc memcpy         5.9
P     cc memcpy+pktcpy  5.9

clang 15.0.7
64-byte packets
Core  Mode              Mpps
P     RTE memcpy        13.3
P     cc memcpy         12.9
P     cc memcpy+pktcpy  13.9

"RTE memcpy" is use_cc_memcpy=false, "cc memcpy" is use_cc_memcpy=true
and "pktcpy" is when this patch is applied.

Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 lib/vhost/virtio_net.c | 37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)
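
"cc memcpy" in the tables above refers to builds where rte_memcpy()
delegates to the compiler/libc memcpy(). As a rough sketch of those
semantics (illustration only, with a hypothetical cc_rte_memcpy() name;
the real build option and wrapper are added by the earlier patches in
this series):

#include <stddef.h>
#include <string.h>

/* Sketch: in "cc memcpy" builds, rte_memcpy() boils down to a plain
 * memcpy() call whose length is unknown at compile time.
 */
static inline void *
cc_rte_memcpy(void *dst, const void *src, size_t n)
{
	return memcpy(dst, src, n);
}

For a length the compiler cannot see at compile time, GCC typically
emits a call to libc memcpy() or an inlined rep movsb, whereas the
fixed-size 32-byte memcpy() calls in the pktcpy() routine in the diff
below can be inlined as a few vector loads and stores.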
  

Comments

Maxime Coquelin Oct. 3, 2024, 11:46 a.m. UTC | #1
On 9/20/24 12:27, Mattias Rönnblom wrote:
> In builds where use_cc_memcpy is set to true, the vhost user PMD
> suffers a large performance drop on Intel P-cores for small packets,
> at least when built with GCC and (to a much lesser extent) clang.
> 
> This patch addresses that issue by using a custom virtio
> memcpy()-based packet copying routine.
> 
> Performance results from a Raptor Lake @ 3.2 GHz:
> 
> GCC 12.3.0
> 64-byte packets
> Core  Mode              Mpps
> E     RTE memcpy        9.5
> E     cc memcpy         9.7
> E     cc memcpy+pktcpy  9.0
> 
> P     RTE memcpy        16.4
> P     cc memcpy         13.5
> P     cc memcpy+pktcpy  16.2
> 
> GCC 12.3.0
> 1500-byte packets
> Core  Mode              Mpps
> P     RTE memcpy        5.8
> P     cc memcpy         5.9
> P     cc memcpy+pktcpy  5.9
> 
> clang 15.0.7
> 64-byte packets
> Core  Mode              Mpps
> P     RTE memcpy        13.3
> P     cc memcpy         12.9
> P     cc memcpy+pktcpy  13.9
> 
> "RTE memcpy" is use_cc_memcpy=false, "cc memcpy" is use_cc_memcpy=true
> and "pktcpy" is when this patch is applied.
> 
> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> ---
>   lib/vhost/virtio_net.c | 37 +++++++++++++++++++++++++++++++++++--
>   1 file changed, 35 insertions(+), 2 deletions(-)
> 

As the default behaviour remains unchanged, this is good to me:

Acked-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime
  
Morten Brørup Oct. 9, 2024, 9:25 p.m. UTC | #2
> +#if defined(RTE_USE_CC_MEMCPY) && defined(RTE_ARCH_X86_64)
> +static __rte_always_inline void
> +pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
> +{

A comment describing why batch_copy_elem.dst and src point to 16-byte aligned data would be nice.

> +	void *dst = __builtin_assume_aligned(in_dst, 16);
> +	const void *src = __builtin_assume_aligned(in_src, 16);
> +
> +	if (len <= 256) {
> +		size_t left;
> +
> +		for (left = len; left >= 32; left -= 32) {
> +			memcpy(dst, src, 32);
> +			dst = RTE_PTR_ADD(dst, 32);
> +			src = RTE_PTR_ADD(src, 32);
> +		}
> +
> +		memcpy(dst, src, left);
> +	} else
> +		memcpy(dst, src, len);
> +}
> +#else
> +static __rte_always_inline void
> +pktcpy(void *dst, const void *src, size_t len)
> +{
> +	rte_memcpy(dst, src, len);
> +}
> +#endif
> +
>  static inline void
>  do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue
> *vq)
>  	__rte_shared_locks_required(&vq->iotlb_lock)
> @@ -240,7 +273,7 @@ do_data_copy_enqueue(struct virtio_net *dev, struct
> vhost_virtqueue *vq)
>  	int i;
> 
>  	for (i = 0; i < count; i++) {
> -		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
> +		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
>  		vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
>  					   elem[i].len);
>  		PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
> @@ -257,7 +290,7 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq)
>  	int i;
> 
>  	for (i = 0; i < count; i++)
> -		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
> +		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
> 
>  	vq->batch_copy_nb_elems = 0;
>  }
> --
> 2.43.0

Anyway,
Acked-by: Morten Brørup <mb@smartsharesystems.com>
  
Stephen Hemminger Oct. 9, 2024, 9:57 p.m. UTC | #3
On Fri, 20 Sep 2024 12:27:16 +0200
Mattias Rönnblom <mattias.ronnblom@ericsson.com> wrote:

> +#if defined(RTE_USE_CC_MEMCPY) && defined(RTE_ARCH_X86_64)
> +static __rte_always_inline void
> +pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
> +{
> +	void *dst = __builtin_assume_aligned(in_dst, 16);
> +	const void *src = __builtin_assume_aligned(in_src, 16);

Not sure the buffer is really aligned that way, but x86 doesn't care.

src and dst can be pointers into an mbuf at an offset, and the offset
will be a multiple of the buffer length.
  
Mattias Rönnblom Oct. 10, 2024, 10:29 a.m. UTC | #4
On 2024-10-09 23:25, Morten Brørup wrote:
>> +#if defined(RTE_USE_CC_MEMCPY) && defined(RTE_ARCH_X86_64)
>> +static __rte_always_inline void
>> +pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
>> +{
> 
> A comment describing why batch_copy_elem.dst and src point to 16-byte aligned data would be nice.
> 

Good point. As I think I mentioned at some point, I'm not sure they are.

From what I recall, having (or pretending) the data is 16-byte aligned
does give a noticeable performance increase on x86_64.
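
If they do turn out to be 16-byte aligned, one way to both document and
verify the assumption could be something like the sketch below (not part
of the patch; pktcpy_checked() is a hypothetical name, and RTE_ASSERT()
compiles to nothing unless RTE_ENABLE_ASSERT is defined):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <rte_common.h>
#include <rte_debug.h>

/* Sketch: same shape as the patch's pktcpy(), but with the 16-byte
 * alignment assumption checked in debug builds before it is handed
 * to the compiler via __builtin_assume_aligned().
 */
static __rte_always_inline void
pktcpy_checked(void *restrict in_dst, const void *restrict in_src, size_t len)
{
	RTE_ASSERT(((uintptr_t)in_dst & 15) == 0);
	RTE_ASSERT(((uintptr_t)in_src & 15) == 0);

	void *dst = __builtin_assume_aligned(in_dst, 16);
	const void *src = __builtin_assume_aligned(in_src, 16);

	if (len <= 256) {
		size_t left;

		for (left = len; left >= 32; left -= 32) {
			memcpy(dst, src, 32);
			dst = RTE_PTR_ADD(dst, 32);
			src = RTE_PTR_ADD(src, 32);
		}

		memcpy(dst, src, left);
	} else {
		memcpy(dst, src, len);
	}
}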

Is this something I should look into for 24.11, or is this patch set not
going to make it anyway?

>> +	void *dst = __builtin_assume_aligned(in_dst, 16);
>> +	const void *src = __builtin_assume_aligned(in_src, 16);
>> +
>> +	if (len <= 256) {
>> +		size_t left;
>> +
>> +		for (left = len; left >= 32; left -= 32) {
>> +			memcpy(dst, src, 32);
>> +			dst = RTE_PTR_ADD(dst, 32);
>> +			src = RTE_PTR_ADD(src, 32);
>> +		}
>> +
>> +		memcpy(dst, src, left);
>> +	} else
>> +		memcpy(dst, src, len);
>> +}
>> +#else
>> +static __rte_always_inline void
>> +pktcpy(void *dst, const void *src, size_t len)
>> +{
>> +	rte_memcpy(dst, src, len);
>> +}
>> +#endif
>> +
>>   static inline void
>>   do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue
>> *vq)
>>   	__rte_shared_locks_required(&vq->iotlb_lock)
>> @@ -240,7 +273,7 @@ do_data_copy_enqueue(struct virtio_net *dev, struct
>> vhost_virtqueue *vq)
>>   	int i;
>>
>>   	for (i = 0; i < count; i++) {
>> -		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
>> +		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
>>   		vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
>>   					   elem[i].len);
>>   		PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
>> @@ -257,7 +290,7 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq)
>>   	int i;
>>
>>   	for (i = 0; i < count; i++)
>> -		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
>> +		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
>>
>>   	vq->batch_copy_nb_elems = 0;
>>   }
>> --
>> 2.43.0
> 
> Anyway,
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
>
  
Mattias Rönnblom Oct. 10, 2024, 10:35 a.m. UTC | #5
On 2024-10-09 23:57, Stephen Hemminger wrote:
> On Fri, 20 Sep 2024 12:27:16 +0200
> Mattias Rönnblom <mattias.ronnblom@ericsson.com> wrote:
> 
>> +#if defined(RTE_USE_CC_MEMCPY) && defined(RTE_ARCH_X86_64)
>> +static __rte_always_inline void
>> +pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
>> +{
>> +	void *dst = __builtin_assume_aligned(in_dst, 16);
>> +	const void *src = __builtin_assume_aligned(in_src, 16);
> 
> Not sure if buffer is really aligned that way but x86 doesn't care.
> 

I think it might care, actually. That's why this makes a difference.
With 16-byte alignment assumed, the compiler may use MOVDQA; otherwise,
it can't and must use MOVDQU. Generally these things don't matter from
a performance point of view in my experience, but in this case it did
(in my benchmark, on my CPU, with my compiler, etc.).
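
A minimal sketch of the effect, with hypothetical copy32_unaligned()/
copy32_aligned() helpers (the exact instructions emitted depend on
compiler, flags and target):

#include <string.h>

/* Without an alignment promise, the compiler has to assume the
 * pointers may be misaligned and typically emits unaligned vector
 * moves (MOVDQU/VMOVDQU on x86_64).
 */
void
copy32_unaligned(void *dst, const void *src)
{
	memcpy(dst, src, 32);
}

/* With __builtin_assume_aligned(), the compiler is promised 16-byte
 * alignment and may emit aligned moves (MOVDQA) instead. If the
 * promise is broken and an aligned move is emitted, the access
 * faults, so the assumption has to actually hold.
 */
void
copy32_aligned(void *dst, const void *src)
{
	void *d = __builtin_assume_aligned(dst, 16);
	const void *s = __builtin_assume_aligned(src, 16);

	memcpy(d, s, 32);
}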

> Since src and dst can be pointers into mbuf at an offset.
> The offset will be a multiple of the buffer len.
  

Patch

diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index 370402d849..63571587a8 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -231,6 +231,39 @@  vhost_async_dma_check_completed(struct virtio_net *dev, int16_t dma_id, uint16_t
 	return nr_copies;
 }
 
+/* The code generated by GCC (and to a lesser extent, clang) with just
+ * a straight memcpy() to copy packets is less than optimal on Intel
+ * P-cores, for small packets. Thus the need of this specialized
+ * memcpy() in builds where use_cc_memcpy is set to true.
+ */
+#if defined(RTE_USE_CC_MEMCPY) && defined(RTE_ARCH_X86_64)
+static __rte_always_inline void
+pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
+{
+	void *dst = __builtin_assume_aligned(in_dst, 16);
+	const void *src = __builtin_assume_aligned(in_src, 16);
+
+	if (len <= 256) {
+		size_t left;
+
+		for (left = len; left >= 32; left -= 32) {
+			memcpy(dst, src, 32);
+			dst = RTE_PTR_ADD(dst, 32);
+			src = RTE_PTR_ADD(src, 32);
+		}
+
+		memcpy(dst, src, left);
+	} else
+		memcpy(dst, src, len);
+}
+#else
+static __rte_always_inline void
+pktcpy(void *dst, const void *src, size_t len)
+{
+	rte_memcpy(dst, src, len);
+}
+#endif
+
 static inline void
 do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
 	__rte_shared_locks_required(&vq->iotlb_lock)
@@ -240,7 +273,7 @@  do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
 	int i;
 
 	for (i = 0; i < count; i++) {
-		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
+		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
 		vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
 					   elem[i].len);
 		PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
@@ -257,7 +290,7 @@  do_data_copy_dequeue(struct vhost_virtqueue *vq)
 	int i;
 
 	for (i = 0; i < count; i++)
-		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
+		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
 
 	vq->batch_copy_nb_elems = 0;
 }