[dpdk-dev,v8,1/3] eal/x86: run-time dispatch over memcpy

Message ID 1507885309-165144-2-git-send-email-xiaoyun.li@intel.com (mailing list archive)
State Accepted, archived
Headers

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation fail Compilation issues

Commit Message

Li, Xiaoyun Oct. 13, 2017, 9:01 a.m. UTC
  This patch dynamically selects functions of memcpy at run-time based
on CPU flags that current machine supports. This patch uses function
pointers which are bound to the relative functions at constructor time.
In addition, AVX512 instructions set would be compiled only if users
config it enabled and the compiler supports it.

Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
---
 lib/librte_eal/bsdapp/eal/Makefile                 |  18 +
 lib/librte_eal/bsdapp/eal/rte_eal_version.map      |   1 +
 lib/librte_eal/common/arch/x86/rte_memcpy.c        |  59 ++
 lib/librte_eal/common/arch/x86/rte_memcpy_avx2.c   |  44 +
 .../common/arch/x86/rte_memcpy_avx512f.c           |  44 +
 lib/librte_eal/common/arch/x86/rte_memcpy_sse.c    |  40 +
 .../common/include/arch/x86/rte_memcpy.h           | 861 +-----------------
 .../common/include/arch/x86/rte_memcpy_internal.h  | 966 +++++++++++++++++++++
 lib/librte_eal/linuxapp/eal/Makefile               |  18 +
 lib/librte_eal/linuxapp/eal/rte_eal_version.map    |   1 +
 mk/rte.cpuflags.mk                                 |  14 +
 11 files changed, 1220 insertions(+), 846 deletions(-)
 create mode 100644 lib/librte_eal/common/arch/x86/rte_memcpy.c
 create mode 100644 lib/librte_eal/common/arch/x86/rte_memcpy_avx2.c
 create mode 100644 lib/librte_eal/common/arch/x86/rte_memcpy_avx512f.c
 create mode 100644 lib/librte_eal/common/arch/x86/rte_memcpy_sse.c
 create mode 100644 lib/librte_eal/common/include/arch/x86/rte_memcpy_internal.h
  

Comments

Thomas Monjalon Oct. 13, 2017, 9:28 a.m. UTC | #1
13/10/2017 11:01, Xiaoyun Li:
>  lib/librte_eal/common/arch/x86/rte_memcpy.c        |  59 ++
>  lib/librte_eal/common/arch/x86/rte_memcpy_avx2.c   |  44 +
>  .../common/arch/x86/rte_memcpy_avx512f.c           |  44 +
>  lib/librte_eal/common/arch/x86/rte_memcpy_sse.c    |  40 +
>  .../common/include/arch/x86/rte_memcpy.h           | 861 +-----------------
>  .../common/include/arch/x86/rte_memcpy_internal.h  | 966 +++++++++++++++++++++

I think that rte_memcpy_internal.h should not be in the include directory.
Can it be moved to lib/librte_eal/common/arch/ ?

> --- a/lib/librte_eal/bsdapp/eal/rte_eal_version.map
> +++ b/lib/librte_eal/bsdapp/eal/rte_eal_version.map
> @@ -243,6 +243,7 @@ DPDK_17.11 {
>  	rte_eal_iova_mode;
>  	rte_eal_mbuf_default_mempool_ops;
>  	rte_lcore_has_role;
> +	rte_memcpy_ptr;

I don't know what is the consequence of adding this function in the .map
file for architectures where it does not exist?
  
Ananyev, Konstantin Oct. 13, 2017, 10:26 a.m. UTC | #2
> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas@monjalon.net]
> Sent: Friday, October 13, 2017 10:29 AM
> To: Li, Xiaoyun <xiaoyun.li@intel.com>; Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Cc: dev@dpdk.org; Richardson, Bruce <bruce.richardson@intel.com>; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Zhang, Helin
> <helin.zhang@intel.com>
> Subject: Re: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch over memcpy
> 
> 13/10/2017 11:01, Xiaoyun Li:
> >  lib/librte_eal/common/arch/x86/rte_memcpy.c        |  59 ++
> >  lib/librte_eal/common/arch/x86/rte_memcpy_avx2.c   |  44 +
> >  .../common/arch/x86/rte_memcpy_avx512f.c           |  44 +
> >  lib/librte_eal/common/arch/x86/rte_memcpy_sse.c    |  40 +
> >  .../common/include/arch/x86/rte_memcpy.h           | 861 +-----------------
> >  .../common/include/arch/x86/rte_memcpy_internal.h  | 966 +++++++++++++++++++++
> 
> I think that rte_memcpy_internal.h should not be in the include directory.
> Can it be moved to lib/librte_eal/common/arch/ ?

I am afraid we can't - for size < 128 bytes we still use inline version of memcpy -
to avoid performance regression.
So we still need that file to stay in include dir.

> 
> > --- a/lib/librte_eal/bsdapp/eal/rte_eal_version.map
> > +++ b/lib/librte_eal/bsdapp/eal/rte_eal_version.map
> > @@ -243,6 +243,7 @@ DPDK_17.11 {
> >  	rte_eal_iova_mode;
> >  	rte_eal_mbuf_default_mempool_ops;
> >  	rte_lcore_has_role;
> > +	rte_memcpy_ptr;
> 
> I don't know what is the consequence of adding this function in the .map
> file for architectures where it does not exist?

I don't have arm/ppc box to try...
Though I tried to add a non-existing function name into
lib/librte_eal/linuxapp/eal/rte_eal_version.map.
Didn't encounter any problems. 
So my guess - it is harmless.
Konstantin
  
Thomas Monjalon Oct. 17, 2017, 9:24 p.m. UTC | #3
Hi,

13/10/2017 11:01, Xiaoyun Li:
> This patch dynamically selects functions of memcpy at run-time based
> on CPU flags that current machine supports. This patch uses function
> pointers which are bound to the relative functions at constructor time.
> In addition, AVX512 instructions set would be compiled only if users
> config it enabled and the compiler supports it.
> 
> Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
> ---
Keeping only the major changes of the patch for later discussions:
[...]
>  static inline void *
>  rte_memcpy(void *dst, const void *src, size_t n)
>  {
> -	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
> -		return rte_memcpy_aligned(dst, src, n);
> +	if (n <= RTE_X86_MEMCPY_THRESH)
> +		return rte_memcpy_internal(dst, src, n);
>  	else
> -		return rte_memcpy_generic(dst, src, n);
> +		return (*rte_memcpy_ptr)(dst, src, n);
>  }
[...]
> +static inline void *
> +rte_memcpy_internal(void *dst, const void *src, size_t n)
> +{
> +	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
> +		return rte_memcpy_aligned(dst, src, n);
> +	else
> +		return rte_memcpy_generic(dst, src, n);
> +}

The significant change of this patch is to call a function pointer
for packet size > 128 (RTE_X86_MEMCPY_THRESH).

Please could you provide some benchmark numbers?

From a test done at Mellanox, there might be a performance degradation
of about 15% in testpmd txonly with AVX2.
Is there someone else seeing a performance degradation?
  
Li, Xiaoyun Oct. 18, 2017, 2:21 a.m. UTC | #4
Hi

> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas@monjalon.net]
> Sent: Wednesday, October 18, 2017 05:24
> To: Li, Xiaoyun <xiaoyun.li@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>
> Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Zhang, Helin
> <helin.zhang@intel.com>; ophirmu@mellanox.com
> Subject: Re: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch over
> memcpy
> 
> Hi,
> 
> 13/10/2017 11:01, Xiaoyun Li:
> > This patch dynamically selects functions of memcpy at run-time based
> > on CPU flags that current machine supports. This patch uses function
> > pointers which are bound to the relative functions at constructor time.
> > In addition, AVX512 instructions set would be compiled only if users
> > config it enabled and the compiler supports it.
> >
> > Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
> > ---
> Keeping only the major changes of the patch for later discussions:
> [...]
> >  static inline void *
> >  rte_memcpy(void *dst, const void *src, size_t n)  {
> > -	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
> > -		return rte_memcpy_aligned(dst, src, n);
> > +	if (n <= RTE_X86_MEMCPY_THRESH)
> > +		return rte_memcpy_internal(dst, src, n);
> >  	else
> > -		return rte_memcpy_generic(dst, src, n);
> > +		return (*rte_memcpy_ptr)(dst, src, n);
> >  }
> [...]
> > +static inline void *
> > +rte_memcpy_internal(void *dst, const void *src, size_t n) {
> > +	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
> > +		return rte_memcpy_aligned(dst, src, n);
> > +	else
> > +		return rte_memcpy_generic(dst, src, n); }
> 
> The significant change of this patch is to call a function pointer for packet
> size > 128 (RTE_X86_MEMCPY_THRESH). 
The perf drop is due to function call replacing inline.

> Please could you provide some benchmark numbers?
I ran memcpy_perf_test which would show the time cost of memcpy. I ran it on broadwell with sse and avx2.
But I just draw pictures and looked at the trend not computed the exact percentage. Sorry about that.
The picture shows results of copy size of 2, 4, 6, 8, 9, 12, 16, 32, 64, 128, 192, 256, 320, 384, 448, 512, 768, 1024, 1518, 1522, 1536, 1600, 2048, 2560, 3072, 3584, 4096, 4608, 5120, 5632, 6144, 6656, 7168, 7680, 8192.
In my test, the size grows, the drop degrades. (Using copy time indicates the perf.)
From the trend picture, when the size is smaller than 128 bytes, the perf drops a lot, almost 50%. And above 128 bytes, it approaches the original dpdk.
I computed it right now, it shows that when greater than 128 bytes and smaller than 1024 bytes, the perf drops about 15%. When above 1024 bytes, the perf drops about 4%.

> From a test done at Mellanox, there might be a performance degradation of
> about 15% in testpmd txonly with AVX2.
> Is there someone else seeing a performance degradation?
  
Li, Xiaoyun Oct. 18, 2017, 6:22 a.m. UTC | #5
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Li, Xiaoyun
> Sent: Wednesday, October 18, 2017 10:22
> To: Thomas Monjalon <thomas@monjalon.net>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>
> Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Zhang, Helin
> <helin.zhang@intel.com>; ophirmu@mellanox.com
> Subject: Re: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch over
> memcpy
> 
> Hi
> 
> > -----Original Message-----
> > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > Sent: Wednesday, October 18, 2017 05:24
> > To: Li, Xiaoyun <xiaoyun.li@intel.com>; Ananyev, Konstantin
> > <konstantin.ananyev@intel.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>
> > Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Zhang, Helin
> > <helin.zhang@intel.com>; ophirmu@mellanox.com
> > Subject: Re: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch over
> > memcpy
> >
> > Hi,
> >
> > 13/10/2017 11:01, Xiaoyun Li:
> > > This patch dynamically selects functions of memcpy at run-time based
> > > on CPU flags that current machine supports. This patch uses function
> > > pointers which are bound to the relative functions at constructor time.
> > > In addition, AVX512 instructions set would be compiled only if users
> > > config it enabled and the compiler supports it.
> > >
> > > Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
> > > ---
> > Keeping only the major changes of the patch for later discussions:
> > [...]
> > >  static inline void *
> > >  rte_memcpy(void *dst, const void *src, size_t n)  {
> > > -	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
> > > -		return rte_memcpy_aligned(dst, src, n);
> > > +	if (n <= RTE_X86_MEMCPY_THRESH)
> > > +		return rte_memcpy_internal(dst, src, n);
> > >  	else
> > > -		return rte_memcpy_generic(dst, src, n);
> > > +		return (*rte_memcpy_ptr)(dst, src, n);
> > >  }
> > [...]
> > > +static inline void *
> > > +rte_memcpy_internal(void *dst, const void *src, size_t n) {
> > > +	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
> > > +		return rte_memcpy_aligned(dst, src, n);
> > > +	else
> > > +		return rte_memcpy_generic(dst, src, n); }
> >
> > The significant change of this patch is to call a function pointer for
> > packet size > 128 (RTE_X86_MEMCPY_THRESH).
> The perf drop is due to function call replacing inline.
> 
> > Please could you provide some benchmark numbers?
> I ran memcpy_perf_test which would show the time cost of memcpy. I ran it
> on broadwell with sse and avx2.
> But I just draw pictures and looked at the trend not computed the exact
> percentage. Sorry about that.
> The picture shows results of copy size of 2, 4, 6, 8, 9, 12, 16, 32, 64, 128, 192,
> 256, 320, 384, 448, 512, 768, 1024, 1518, 1522, 1536, 1600, 2048, 2560, 3072,
> 3584, 4096, 4608, 5120, 5632, 6144, 6656, 7168, 7680, 8192.
> In my test, the size grows, the drop degrades. (Using copy time indicates the
> perf.) From the trend picture, when the size is smaller than 128 bytes, the
> perf drops a lot, almost 50%. And above 128 bytes, it approaches the original
> dpdk.
> I computed it right now, it shows that when greater than 128 bytes and
> smaller than 1024 bytes, the perf drops about 15%. When above 1024 bytes,
> the perf drops about 4%.
> 
> > From a test done at Mellanox, there might be a performance degradation
> > of about 15% in testpmd txonly with AVX2.

Another thing, I will test testpmd txonly with intel nics and mellanox these days.
And try adjusting the RTE_X86_MEMCPY_THRESH to see if there is any improvement.

> > Is there someone else seeing a performance degradation?
  
Li, Xiaoyun Oct. 19, 2017, 2:45 a.m. UTC | #6
Hi
> > >
> > > The significant change of this patch is to call a function pointer
> > > for packet size > 128 (RTE_X86_MEMCPY_THRESH).
> > The perf drop is due to function call replacing inline.
> >
> > > Please could you provide some benchmark numbers?
> > I ran memcpy_perf_test which would show the time cost of memcpy. I ran
> > it on broadwell with sse and avx2.
> > But I just draw pictures and looked at the trend not computed the
> > exact percentage. Sorry about that.
> > The picture shows results of copy size of 2, 4, 6, 8, 9, 12, 16, 32,
> > 64, 128, 192, 256, 320, 384, 448, 512, 768, 1024, 1518, 1522, 1536,
> > 1600, 2048, 2560, 3072, 3584, 4096, 4608, 5120, 5632, 6144, 6656, 7168,
> 7680, 8192.
> > In my test, the size grows, the drop degrades. (Using copy time
> > indicates the
> > perf.) From the trend picture, when the size is smaller than 128
> > bytes, the perf drops a lot, almost 50%. And above 128 bytes, it
> > approaches the original dpdk.
> > I computed it right now, it shows that when greater than 128 bytes and
> > smaller than 1024 bytes, the perf drops about 15%. When above 1024
> > bytes, the perf drops about 4%.
> >
> > > From a test done at Mellanox, there might be a performance
> > > degradation of about 15% in testpmd txonly with AVX2.
> 

I did tests on X710, XXV710, X540 and MT27710 but didn't see performance degradation.

I used command "./x86_64-native-linuxapp-gcc/app/testpmd -c 0xf -n 4 -- -I" and set fwd txonly. 
I tested it on v17.11-rc1, then revert my patch and tested it again.
Show port stats all and see the throughput pps. But the results are similar and no drop.

Did I miss something?

> Another thing, I will test testpmd txonly with intel nics and mellanox these
> days.
> And try adjusting the RTE_X86_MEMCPY_THRESH to see if there is any
> improvement.
> 
> > > Is there someone else seeing a performance degradation?
  
Thomas Monjalon Oct. 19, 2017, 6:58 a.m. UTC | #7
19/10/2017 04:45, Li, Xiaoyun:
> Hi
> > > >
> > > > The significant change of this patch is to call a function pointer
> > > > for packet size > 128 (RTE_X86_MEMCPY_THRESH).
> > > The perf drop is due to function call replacing inline.
> > >
> > > > Please could you provide some benchmark numbers?
> > > I ran memcpy_perf_test which would show the time cost of memcpy. I ran
> > > it on broadwell with sse and avx2.
> > > But I just draw pictures and looked at the trend not computed the
> > > exact percentage. Sorry about that.
> > > The picture shows results of copy size of 2, 4, 6, 8, 9, 12, 16, 32,
> > > 64, 128, 192, 256, 320, 384, 448, 512, 768, 1024, 1518, 1522, 1536,
> > > 1600, 2048, 2560, 3072, 3584, 4096, 4608, 5120, 5632, 6144, 6656, 7168,
> > 7680, 8192.
> > > In my test, the size grows, the drop degrades. (Using copy time
> > > indicates the
> > > perf.) From the trend picture, when the size is smaller than 128
> > > bytes, the perf drops a lot, almost 50%. And above 128 bytes, it
> > > approaches the original dpdk.
> > > I computed it right now, it shows that when greater than 128 bytes and
> > > smaller than 1024 bytes, the perf drops about 15%. When above 1024
> > > bytes, the perf drops about 4%.
> > >
> > > > From a test done at Mellanox, there might be a performance
> > > > degradation of about 15% in testpmd txonly with AVX2.
> > 
> 
> I did tests on X710, XXV710, X540 and MT27710 but didn't see performance degradation.
> 
> I used command "./x86_64-native-linuxapp-gcc/app/testpmd -c 0xf -n 4 -- -I" and set fwd txonly. 
> I tested it on v17.11-rc1, then revert my patch and tested it again.
> Show port stats all and see the throughput pps. But the results are similar and no drop.
> 
> Did I miss something?

I do not understand. Yesterday you confirmed a 15% drop with buffers between
128 and 1024 bytes.
But you do not see this drop in your txonly tests, right?

> > Another thing, I will test testpmd txonly with intel nics and mellanox these
> > days.
> > And try adjusting the RTE_X86_MEMCPY_THRESH to see if there is any
> > improvement.
> > 
> > > > Is there someone else seeing a performance degradation?
  
Li, Xiaoyun Oct. 19, 2017, 7:51 a.m. UTC | #8
> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas@monjalon.net]
> Sent: Thursday, October 19, 2017 14:59
> To: Li, Xiaoyun <xiaoyun.li@intel.com>
> Cc: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Richardson,
> Bruce <bruce.richardson@intel.com>; dev@dpdk.org; Lu, Wenzhuo
> <wenzhuo.lu@intel.com>; Zhang, Helin <helin.zhang@intel.com>;
> ophirmu@mellanox.com
> Subject: Re: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch over
> memcpy
> 
> 19/10/2017 04:45, Li, Xiaoyun:
> > Hi
> > > > >
> > > > > The significant change of this patch is to call a function
> > > > > pointer for packet size > 128 (RTE_X86_MEMCPY_THRESH).
> > > > The perf drop is due to function call replacing inline.
> > > >
> > > > > Please could you provide some benchmark numbers?
> > > > I ran memcpy_perf_test which would show the time cost of memcpy. I
> > > > ran it on broadwell with sse and avx2.
> > > > But I just draw pictures and looked at the trend not computed the
> > > > exact percentage. Sorry about that.
> > > > The picture shows results of copy size of 2, 4, 6, 8, 9, 12, 16,
> > > > 32, 64, 128, 192, 256, 320, 384, 448, 512, 768, 1024, 1518, 1522,
> > > > 1536, 1600, 2048, 2560, 3072, 3584, 4096, 4608, 5120, 5632, 6144,
> > > > 6656, 7168,
> > > 7680, 8192.
> > > > In my test, the size grows, the drop degrades. (Using copy time
> > > > indicates the
> > > > perf.) From the trend picture, when the size is smaller than 128
> > > > bytes, the perf drops a lot, almost 50%. And above 128 bytes, it
> > > > approaches the original dpdk.
> > > > I computed it right now, it shows that when greater than 128 bytes
> > > > and smaller than 1024 bytes, the perf drops about 15%. When above
> > > > 1024 bytes, the perf drops about 4%.
> > > >
> > > > > From a test done at Mellanox, there might be a performance
> > > > > degradation of about 15% in testpmd txonly with AVX2.
> > >
> >
> > I did tests on X710, XXV710, X540 and MT27710 but didn't see
> performance degradation.
> >
> > I used command "./x86_64-native-linuxapp-gcc/app/testpmd -c 0xf -n 4 -- -
> I" and set fwd txonly.
> > I tested it on v17.11-rc1, then revert my patch and tested it again.
> > Show port stats all and see the throughput pps. But the results are similar
> and no drop.
> >
> > Did I miss something?
> 
> I do not understand. Yesterday you confirmed a 15% drop with buffers
> between
> 128 and 1024 bytes.
> But you do not see this drop in your txonly tests, right?
> 
Yes. The drop is using test.
Using command "make test -j" and then " ./build/app/test -c f -n 4 " 
Then run "memcpy_perf_autotest"
The results are the cycles that memory copy costs.
But I just use it to show the trend because I heard that it's not recommended to use micro benchmarks like test_memcpy_perf for memcpy performance report as they aren't likely able to reflect performance of real world applications.
Details can be seen at https://software.intel.com/en-us/articles/performance-optimization-of-memcpy-in-dpdk

And I didn't see drop in testpmd txonly test. Maybe it's because not a lot memcpy calls.

> > > Another thing, I will test testpmd txonly with intel nics and
> > > mellanox these days.
> > > And try adjusting the RTE_X86_MEMCPY_THRESH to see if there is any
> > > improvement.
> > >
> > > > > Is there someone else seeing a performance degradation?
> 
>
  
Thomas Monjalon Oct. 19, 2017, 8:33 a.m. UTC | #9
19/10/2017 09:51, Li, Xiaoyun:
> From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > 19/10/2017 04:45, Li, Xiaoyun:
> > > Hi
> > > > > >
> > > > > > The significant change of this patch is to call a function
> > > > > > pointer for packet size > 128 (RTE_X86_MEMCPY_THRESH).
> > > > > The perf drop is due to function call replacing inline.
> > > > >
> > > > > > Please could you provide some benchmark numbers?
> > > > > I ran memcpy_perf_test which would show the time cost of memcpy. I
> > > > > ran it on broadwell with sse and avx2.
> > > > > But I just draw pictures and looked at the trend not computed the
> > > > > exact percentage. Sorry about that.
> > > > > The picture shows results of copy size of 2, 4, 6, 8, 9, 12, 16,
> > > > > 32, 64, 128, 192, 256, 320, 384, 448, 512, 768, 1024, 1518, 1522,
> > > > > 1536, 1600, 2048, 2560, 3072, 3584, 4096, 4608, 5120, 5632, 6144,
> > > > > 6656, 7168,
> > > > 7680, 8192.
> > > > > In my test, the size grows, the drop degrades. (Using copy time
> > > > > indicates the
> > > > > perf.) From the trend picture, when the size is smaller than 128
> > > > > bytes, the perf drops a lot, almost 50%. And above 128 bytes, it
> > > > > approaches the original dpdk.
> > > > > I computed it right now, it shows that when greater than 128 bytes
> > > > > and smaller than 1024 bytes, the perf drops about 15%. When above
> > > > > 1024 bytes, the perf drops about 4%.
> > > > >
> > > > > > From a test done at Mellanox, there might be a performance
> > > > > > degradation of about 15% in testpmd txonly with AVX2.
> > > >
> > >
> > > I did tests on X710, XXV710, X540 and MT27710 but didn't see
> > performance degradation.
> > >
> > > I used command "./x86_64-native-linuxapp-gcc/app/testpmd -c 0xf -n 4 -- -
> > I" and set fwd txonly.
> > > I tested it on v17.11-rc1, then revert my patch and tested it again.
> > > Show port stats all and see the throughput pps. But the results are similar
> > and no drop.
> > >
> > > Did I miss something?
> > 
> > I do not understand. Yesterday you confirmed a 15% drop with buffers
> > between
> > 128 and 1024 bytes.
> > But you do not see this drop in your txonly tests, right?
> > 
> Yes. The drop is using test.
> Using command "make test -j" and then " ./build/app/test -c f -n 4 " 
> Then run "memcpy_perf_autotest"
> The results are the cycles that memory copy costs.
> But I just use it to show the trend because I heard that it's not recommended to use micro benchmarks like test_memcpy_perf for memcpy performance report as they aren't likely able to reflect performance of real world applications.

Yes real applications can hide the memcpy cost.
Sometimes, the cost appear for real :)

> Details can be seen at https://software.intel.com/en-us/articles/performance-optimization-of-memcpy-in-dpdk
> 
> And I didn't see drop in testpmd txonly test. Maybe it's because not a lot memcpy calls.

It has been seen in a mlx4 use-case using more memcpy.
I think 15% in micro-benchmark is too much.
What can we do? Raise the threshold?

> > > > Another thing, I will test testpmd txonly with intel nics and
> > > > mellanox these days.
> > > > And try adjusting the RTE_X86_MEMCPY_THRESH to see if there is any
> > > > improvement.
> > > >
> > > > > > Is there someone else seeing a performance degradation?
  
Li, Xiaoyun Oct. 19, 2017, 8:50 a.m. UTC | #10
> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas@monjalon.net]
> Sent: Thursday, October 19, 2017 16:34
> To: Li, Xiaoyun <xiaoyun.li@intel.com>
> Cc: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Richardson,
> Bruce <bruce.richardson@intel.com>; dev@dpdk.org; Lu, Wenzhuo
> <wenzhuo.lu@intel.com>; Zhang, Helin <helin.zhang@intel.com>;
> ophirmu@mellanox.com
> Subject: Re: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch over
> memcpy
> 
> 19/10/2017 09:51, Li, Xiaoyun:
> > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > 19/10/2017 04:45, Li, Xiaoyun:
> > > > Hi
> > > > > > >
> > > > > > > The significant change of this patch is to call a function
> > > > > > > pointer for packet size > 128 (RTE_X86_MEMCPY_THRESH).
> > > > > > The perf drop is due to function call replacing inline.
> > > > > >
> > > > > > > Please could you provide some benchmark numbers?
> > > > > > I ran memcpy_perf_test which would show the time cost of
> > > > > > memcpy. I ran it on broadwell with sse and avx2.
> > > > > > But I just draw pictures and looked at the trend not computed
> > > > > > the exact percentage. Sorry about that.
> > > > > > The picture shows results of copy size of 2, 4, 6, 8, 9, 12,
> > > > > > 16, 32, 64, 128, 192, 256, 320, 384, 448, 512, 768, 1024,
> > > > > > 1518, 1522, 1536, 1600, 2048, 2560, 3072, 3584, 4096, 4608,
> > > > > > 5120, 5632, 6144, 6656, 7168,
> > > > > 7680, 8192.
> > > > > > In my test, the size grows, the drop degrades. (Using copy
> > > > > > time indicates the
> > > > > > perf.) From the trend picture, when the size is smaller than
> > > > > > 128 bytes, the perf drops a lot, almost 50%. And above 128
> > > > > > bytes, it approaches the original dpdk.
> > > > > > I computed it right now, it shows that when greater than 128
> > > > > > bytes and smaller than 1024 bytes, the perf drops about 15%.
> > > > > > When above
> > > > > > 1024 bytes, the perf drops about 4%.
> > > > > >
> > > > > > > From a test done at Mellanox, there might be a performance
> > > > > > > degradation of about 15% in testpmd txonly with AVX2.
> > > > >
> > > >
> > > > I did tests on X710, XXV710, X540 and MT27710 but didn't see
> > > performance degradation.
> > > >
> > > > I used command "./x86_64-native-linuxapp-gcc/app/testpmd -c 0xf -n
> > > > 4 -- -
> > > I" and set fwd txonly.
> > > > I tested it on v17.11-rc1, then revert my patch and tested it again.
> > > > Show port stats all and see the throughput pps. But the results
> > > > are similar
> > > and no drop.
> > > >
> > > > Did I miss something?
> > >
> > > I do not understand. Yesterday you confirmed a 15% drop with buffers
> > > between
> > > 128 and 1024 bytes.
> > > But you do not see this drop in your txonly tests, right?
> > >
> > Yes. The drop is using test.
> > Using command "make test -j" and then " ./build/app/test -c f -n 4 "
> > Then run "memcpy_perf_autotest"
> > The results are the cycles that memory copy costs.
> > But I just use it to show the trend because I heard that it's not
> recommended to use micro benchmarks like test_memcpy_perf for memcpy
> performance report as they aren't likely able to reflect performance of real
> world applications.
> 
> Yes real applications can hide the memcpy cost.
> Sometimes, the cost appear for real :)
> 
> > Details can be seen at
> > https://software.intel.com/en-us/articles/performance-optimization-of-
> > memcpy-in-dpdk
> >
> > And I didn't see drop in testpmd txonly test. Maybe it's because not a lot
> memcpy calls.
> 
> It has been seen in a mlx4 use-case using more memcpy.
> I think 15% in micro-benchmark is too much.
> What can we do? Raise the threshold?
> 
I think so. If there is big drop, can try raise the threshold. Maybe 1024? but not sure.
But I didn't reproduce the 15% drop on mellanox and not sure how to verify it.

> > > > > Another thing, I will test testpmd txonly with intel nics and
> > > > > mellanox these days.
> > > > > And try adjusting the RTE_X86_MEMCPY_THRESH to see if there is
> > > > > any improvement.
> > > > >
> > > > > > > Is there someone else seeing a performance degradation?
  
Ananyev, Konstantin Oct. 19, 2017, 8:59 a.m. UTC | #11
> -----Original Message-----
> From: Li, Xiaoyun
> Sent: Thursday, October 19, 2017 9:51 AM
> To: Thomas Monjalon <thomas@monjalon.net>
> Cc: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>; dev@dpdk.org; Lu, Wenzhuo
> <wenzhuo.lu@intel.com>; Zhang, Helin <helin.zhang@intel.com>; ophirmu@mellanox.com
> Subject: RE: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch over memcpy
> 
> 
> 
> > -----Original Message-----
> > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > Sent: Thursday, October 19, 2017 16:34
> > To: Li, Xiaoyun <xiaoyun.li@intel.com>
> > Cc: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Richardson,
> > Bruce <bruce.richardson@intel.com>; dev@dpdk.org; Lu, Wenzhuo
> > <wenzhuo.lu@intel.com>; Zhang, Helin <helin.zhang@intel.com>;
> > ophirmu@mellanox.com
> > Subject: Re: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch over
> > memcpy
> >
> > 19/10/2017 09:51, Li, Xiaoyun:
> > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > > 19/10/2017 04:45, Li, Xiaoyun:
> > > > > Hi
> > > > > > > >
> > > > > > > > The significant change of this patch is to call a function
> > > > > > > > pointer for packet size > 128 (RTE_X86_MEMCPY_THRESH).
> > > > > > > The perf drop is due to function call replacing inline.
> > > > > > >
> > > > > > > > Please could you provide some benchmark numbers?
> > > > > > > I ran memcpy_perf_test which would show the time cost of
> > > > > > > memcpy. I ran it on broadwell with sse and avx2.
> > > > > > > But I just draw pictures and looked at the trend not computed
> > > > > > > the exact percentage. Sorry about that.
> > > > > > > The picture shows results of copy size of 2, 4, 6, 8, 9, 12,
> > > > > > > 16, 32, 64, 128, 192, 256, 320, 384, 448, 512, 768, 1024,
> > > > > > > 1518, 1522, 1536, 1600, 2048, 2560, 3072, 3584, 4096, 4608,
> > > > > > > 5120, 5632, 6144, 6656, 7168,
> > > > > > 7680, 8192.
> > > > > > > In my test, the size grows, the drop degrades. (Using copy
> > > > > > > time indicates the
> > > > > > > perf.) From the trend picture, when the size is smaller than
> > > > > > > 128 bytes, the perf drops a lot, almost 50%. And above 128
> > > > > > > bytes, it approaches the original dpdk.
> > > > > > > I computed it right now, it shows that when greater than 128
> > > > > > > bytes and smaller than 1024 bytes, the perf drops about 15%.
> > > > > > > When above
> > > > > > > 1024 bytes, the perf drops about 4%.
> > > > > > >
> > > > > > > > From a test done at Mellanox, there might be a performance
> > > > > > > > degradation of about 15% in testpmd txonly with AVX2.
> > > > > >
> > > > >
> > > > > I did tests on X710, XXV710, X540 and MT27710 but didn't see
> > > > performance degradation.
> > > > >
> > > > > I used command "./x86_64-native-linuxapp-gcc/app/testpmd -c 0xf -n
> > > > > 4 -- -
> > > > I" and set fwd txonly.
> > > > > I tested it on v17.11-rc1, then revert my patch and tested it again.
> > > > > Show port stats all and see the throughput pps. But the results
> > > > > are similar
> > > > and no drop.
> > > > >
> > > > > Did I miss something?
> > > >
> > > > I do not understand. Yesterday you confirmed a 15% drop with buffers
> > > > between
> > > > 128 and 1024 bytes.
> > > > But you do not see this drop in your txonly tests, right?
> > > >
> > > Yes. The drop is using test.
> > > Using command "make test -j" and then " ./build/app/test -c f -n 4 "
> > > Then run "memcpy_perf_autotest"
> > > The results are the cycles that memory copy costs.
> > > But I just use it to show the trend because I heard that it's not
> > recommended to use micro benchmarks like test_memcpy_perf for memcpy
> > performance report as they aren't likely able to reflect performance of real
> > world applications.
> >
> > Yes real applications can hide the memcpy cost.
> > Sometimes, the cost appear for real :)
> >
> > > Details can be seen at
> > > https://software.intel.com/en-us/articles/performance-optimization-of-
> > > memcpy-in-dpdk
> > >
> > > And I didn't see drop in testpmd txonly test. Maybe it's because not a lot
> > memcpy calls.
> >
> > It has been seen in a mlx4 use-case using more memcpy.
> > I think 15% in micro-benchmark is too much.
> > What can we do? Raise the threshold?
> >
> I think so. If there is big drop, can try raise the threshold. Maybe 1024? but not sure.
> But I didn't reproduce the 15% drop on mellanox and not sure how to verify it.

Can we make it dynamically adjustable then?
A global variable initialized to some default value or so?
Unless you reckon that it would affect performance any further...
Konstantin

> 
> > > > > > Another thing, I will test testpmd txonly with intel nics and
> > > > > > mellanox these days.
> > > > > > And try adjusting the RTE_X86_MEMCPY_THRESH to see if there is
> > > > > > any improvement.
> > > > > >
> > > > > > > > Is there someone else seeing a performance degradation?
  
Thomas Monjalon Oct. 19, 2017, 9 a.m. UTC | #12
19/10/2017 10:50, Li, Xiaoyun:
> 
> > -----Original Message-----
> > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > Sent: Thursday, October 19, 2017 16:34
> > To: Li, Xiaoyun <xiaoyun.li@intel.com>
> > Cc: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Richardson,
> > Bruce <bruce.richardson@intel.com>; dev@dpdk.org; Lu, Wenzhuo
> > <wenzhuo.lu@intel.com>; Zhang, Helin <helin.zhang@intel.com>;
> > ophirmu@mellanox.com
> > Subject: Re: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch over
> > memcpy
> > 
> > 19/10/2017 09:51, Li, Xiaoyun:
> > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > > 19/10/2017 04:45, Li, Xiaoyun:
> > > > > Hi
> > > > > > > >
> > > > > > > > The significant change of this patch is to call a function
> > > > > > > > pointer for packet size > 128 (RTE_X86_MEMCPY_THRESH).
> > > > > > > The perf drop is due to function call replacing inline.
> > > > > > >
> > > > > > > > Please could you provide some benchmark numbers?
> > > > > > > I ran memcpy_perf_test which would show the time cost of
> > > > > > > memcpy. I ran it on broadwell with sse and avx2.
> > > > > > > But I just draw pictures and looked at the trend not computed
> > > > > > > the exact percentage. Sorry about that.
> > > > > > > The picture shows results of copy size of 2, 4, 6, 8, 9, 12,
> > > > > > > 16, 32, 64, 128, 192, 256, 320, 384, 448, 512, 768, 1024,
> > > > > > > 1518, 1522, 1536, 1600, 2048, 2560, 3072, 3584, 4096, 4608,
> > > > > > > 5120, 5632, 6144, 6656, 7168,
> > > > > > 7680, 8192.
> > > > > > > In my test, the size grows, the drop degrades. (Using copy
> > > > > > > time indicates the
> > > > > > > perf.) From the trend picture, when the size is smaller than
> > > > > > > 128 bytes, the perf drops a lot, almost 50%. And above 128
> > > > > > > bytes, it approaches the original dpdk.
> > > > > > > I computed it right now, it shows that when greater than 128
> > > > > > > bytes and smaller than 1024 bytes, the perf drops about 15%.
> > > > > > > When above
> > > > > > > 1024 bytes, the perf drops about 4%.
> > > > > > >
> > > > > > > > From a test done at Mellanox, there might be a performance
> > > > > > > > degradation of about 15% in testpmd txonly with AVX2.
> > > > > >
> > > > >
> > > > > I did tests on X710, XXV710, X540 and MT27710 but didn't see
> > > > performance degradation.
> > > > >
> > > > > I used command "./x86_64-native-linuxapp-gcc/app/testpmd -c 0xf -n
> > > > > 4 -- -
> > > > I" and set fwd txonly.
> > > > > I tested it on v17.11-rc1, then revert my patch and tested it again.
> > > > > Show port stats all and see the throughput pps. But the results
> > > > > are similar
> > > > and no drop.
> > > > >
> > > > > Did I miss something?
> > > >
> > > > I do not understand. Yesterday you confirmed a 15% drop with buffers
> > > > between
> > > > 128 and 1024 bytes.
> > > > But you do not see this drop in your txonly tests, right?
> > > >
> > > Yes. The drop is using test.
> > > Using command "make test -j" and then " ./build/app/test -c f -n 4 "
> > > Then run "memcpy_perf_autotest"
> > > The results are the cycles that memory copy costs.
> > > But I just use it to show the trend because I heard that it's not
> > recommended to use micro benchmarks like test_memcpy_perf for memcpy
> > performance report as they aren't likely able to reflect performance of real
> > world applications.
> > 
> > Yes real applications can hide the memcpy cost.
> > Sometimes, the cost appear for real :)
> > 
> > > Details can be seen at
> > > https://software.intel.com/en-us/articles/performance-optimization-of-
> > > memcpy-in-dpdk
> > >
> > > And I didn't see drop in testpmd txonly test. Maybe it's because not a lot
> > memcpy calls.
> > 
> > It has been seen in a mlx4 use-case using more memcpy.
> > I think 15% in micro-benchmark is too much.
> > What can we do? Raise the threshold?
> > 
> I think so. If there is big drop, can try raise the threshold. Maybe 1024? but not sure.
> But I didn't reproduce the 15% drop on mellanox and not sure how to verify it.

I think we should focus on micro-benchmark and find a reasonable threshold
for a reasonable drop tradeoff.
  
Bruce Richardson Oct. 19, 2017, 9:29 a.m. UTC | #13
On Thu, Oct 19, 2017 at 11:00:33AM +0200, Thomas Monjalon wrote:
> 19/10/2017 10:50, Li, Xiaoyun:
> > 
> > > -----Original Message-----
> > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > Sent: Thursday, October 19, 2017 16:34
> > > To: Li, Xiaoyun <xiaoyun.li@intel.com>
> > > Cc: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Richardson,
> > > Bruce <bruce.richardson@intel.com>; dev@dpdk.org; Lu, Wenzhuo
> > > <wenzhuo.lu@intel.com>; Zhang, Helin <helin.zhang@intel.com>;
> > > ophirmu@mellanox.com
> > > Subject: Re: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch over
> > > memcpy
> > > 
> > > 19/10/2017 09:51, Li, Xiaoyun:
> > > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > > > 19/10/2017 04:45, Li, Xiaoyun:
> > > > > > Hi
> > > > > > > > >
> > > > > > > > > The significant change of this patch is to call a function
> > > > > > > > > pointer for packet size > 128 (RTE_X86_MEMCPY_THRESH).
> > > > > > > > The perf drop is due to function call replacing inline.
> > > > > > > >
> > > > > > > > > Please could you provide some benchmark numbers?
> > > > > > > > I ran memcpy_perf_test which would show the time cost of
> > > > > > > > memcpy. I ran it on broadwell with sse and avx2.
> > > > > > > > But I just draw pictures and looked at the trend not computed
> > > > > > > > the exact percentage. Sorry about that.
> > > > > > > > The picture shows results of copy size of 2, 4, 6, 8, 9, 12,
> > > > > > > > 16, 32, 64, 128, 192, 256, 320, 384, 448, 512, 768, 1024,
> > > > > > > > 1518, 1522, 1536, 1600, 2048, 2560, 3072, 3584, 4096, 4608,
> > > > > > > > 5120, 5632, 6144, 6656, 7168,
> > > > > > > 7680, 8192.
> > > > > > > > In my test, the size grows, the drop degrades. (Using copy
> > > > > > > > time indicates the
> > > > > > > > perf.) From the trend picture, when the size is smaller than
> > > > > > > > 128 bytes, the perf drops a lot, almost 50%. And above 128
> > > > > > > > bytes, it approaches the original dpdk.
> > > > > > > > I computed it right now, it shows that when greater than 128
> > > > > > > > bytes and smaller than 1024 bytes, the perf drops about 15%.
> > > > > > > > When above
> > > > > > > > 1024 bytes, the perf drops about 4%.
> > > > > > > >
> > > > > > > > > From a test done at Mellanox, there might be a performance
> > > > > > > > > degradation of about 15% in testpmd txonly with AVX2.
> > > > > > >
> > > > > >
> > > > > > I did tests on X710, XXV710, X540 and MT27710 but didn't see
> > > > > performance degradation.
> > > > > >
> > > > > > I used command "./x86_64-native-linuxapp-gcc/app/testpmd -c 0xf -n
> > > > > > 4 -- -
> > > > > I" and set fwd txonly.
> > > > > > I tested it on v17.11-rc1, then revert my patch and tested it again.
> > > > > > Show port stats all and see the throughput pps. But the results
> > > > > > are similar
> > > > > and no drop.
> > > > > >
> > > > > > Did I miss something?
> > > > >
> > > > > I do not understand. Yesterday you confirmed a 15% drop with buffers
> > > > > between
> > > > > 128 and 1024 bytes.
> > > > > But you do not see this drop in your txonly tests, right?
> > > > >
> > > > Yes. The drop is using test.
> > > > Using command "make test -j" and then " ./build/app/test -c f -n 4 "
> > > > Then run "memcpy_perf_autotest"
> > > > The results are the cycles that memory copy costs.
> > > > But I just use it to show the trend because I heard that it's not
> > > recommended to use micro benchmarks like test_memcpy_perf for memcpy
> > > performance report as they aren't likely able to reflect performance of real
> > > world applications.
> > > 
> > > Yes real applications can hide the memcpy cost.
> > > Sometimes, the cost appear for real :)
> > > 
> > > > Details can be seen at
> > > > https://software.intel.com/en-us/articles/performance-optimization-of-
> > > > memcpy-in-dpdk
> > > >
> > > > And I didn't see drop in testpmd txonly test. Maybe it's because not a lot
> > > memcpy calls.
> > > 
> > > It has been seen in a mlx4 use-case using more memcpy.
> > > I think 15% in micro-benchmark is too much.
> > > What can we do? Raise the threshold?
> > > 
> > I think so. If there is big drop, can try raise the threshold. Maybe 1024? but not sure.
> > But I didn't reproduce the 15% drop on mellanox and not sure how to verify it.
> 
> I think we should focus on micro-benchmark and find a reasonnable threshold
> for a reasonnable drop tradeoff.
>
Sadly, it may not be that simple. What shows best performance for
micro-benchmarks may not show the same effect in a real application.

/Bruce
  
Li, Xiaoyun Oct. 20, 2017, 1:02 a.m. UTC | #14
> -----Original Message-----
> From: Richardson, Bruce
> Sent: Thursday, October 19, 2017 17:30
> To: Thomas Monjalon <thomas@monjalon.net>
> Cc: Li, Xiaoyun <xiaoyun.li@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; dev@dpdk.org; Lu, Wenzhuo
> <wenzhuo.lu@intel.com>; Zhang, Helin <helin.zhang@intel.com>;
> ophirmu@mellanox.com
> Subject: Re: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch over
> memcpy
> 
> On Thu, Oct 19, 2017 at 11:00:33AM +0200, Thomas Monjalon wrote:
> > 19/10/2017 10:50, Li, Xiaoyun:
> > >
> > > > -----Original Message-----
> > > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > > Sent: Thursday, October 19, 2017 16:34
> > > > To: Li, Xiaoyun <xiaoyun.li@intel.com>
> > > > Cc: Ananyev, Konstantin <konstantin.ananyev@intel.com>;
> > > > Richardson, Bruce <bruce.richardson@intel.com>; dev@dpdk.org; Lu,
> > > > Wenzhuo <wenzhuo.lu@intel.com>; Zhang, Helin
> > > > <helin.zhang@intel.com>; ophirmu@mellanox.com
> > > > Subject: Re: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch
> > > > over memcpy
> > > >
> > > > 19/10/2017 09:51, Li, Xiaoyun:
> > > > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > > > > 19/10/2017 04:45, Li, Xiaoyun:
> > > > > > > Hi
> > > > > > > > > >
> > > > > > > > > > The significant change of this patch is to call a
> > > > > > > > > > function pointer for packet size > 128
> (RTE_X86_MEMCPY_THRESH).
> > > > > > > > > The perf drop is due to function call replacing inline.
> > > > > > > > >
> > > > > > > > > > Please could you provide some benchmark numbers?
> > > > > > > > > I ran memcpy_perf_test which would show the time cost of
> > > > > > > > > memcpy. I ran it on broadwell with sse and avx2.
> > > > > > > > > But I just draw pictures and looked at the trend not
> > > > > > > > > computed the exact percentage. Sorry about that.
> > > > > > > > > The picture shows results of copy size of 2, 4, 6, 8, 9,
> > > > > > > > > 12, 16, 32, 64, 128, 192, 256, 320, 384, 448, 512, 768,
> > > > > > > > > 1024, 1518, 1522, 1536, 1600, 2048, 2560, 3072, 3584,
> > > > > > > > > 4096, 4608, 5120, 5632, 6144, 6656, 7168,
> > > > > > > > 7680, 8192.
> > > > > > > > > In my test, the size grows, the drop degrades. (Using
> > > > > > > > > copy time indicates the
> > > > > > > > > perf.) From the trend picture, when the size is smaller
> > > > > > > > > than
> > > > > > > > > 128 bytes, the perf drops a lot, almost 50%. And above
> > > > > > > > > 128 bytes, it approaches the original dpdk.
> > > > > > > > > I computed it right now, it shows that when greater than
> > > > > > > > > 128 bytes and smaller than 1024 bytes, the perf drops about
> 15%.
> > > > > > > > > When above
> > > > > > > > > 1024 bytes, the perf drops about 4%.
> > > > > > > > >
> > > > > > > > > > From a test done at Mellanox, there might be a
> > > > > > > > > > performance degradation of about 15% in testpmd txonly
> with AVX2.
> > > > > > > >
> > > > > > >
> > > > > > > I did tests on X710, XXV710, X540 and MT27710 but didn't see
> > > > > > performance degradation.
> > > > > > >
> > > > > > > I used command "./x86_64-native-linuxapp-gcc/app/testpmd -c
> > > > > > > 0xf -n
> > > > > > > 4 -- -
> > > > > > I" and set fwd txonly.
> > > > > > > I tested it on v17.11-rc1, then revert my patch and tested it again.
> > > > > > > Show port stats all and see the throughput pps. But the
> > > > > > > results are similar
> > > > > > and no drop.
> > > > > > >
> > > > > > > Did I miss something?
> > > > > >
> > > > > > I do not understand. Yesterday you confirmed a 15% drop with
> > > > > > buffers between
> > > > > > 128 and 1024 bytes.
> > > > > > But you do not see this drop in your txonly tests, right?
> > > > > >
> > > > > Yes. The drop is using test.
> > > > > Using command "make test -j" and then " ./build/app/test -c f -n 4 "
> > > > > Then run "memcpy_perf_autotest"
> > > > > The results are the cycles that memory copy costs.
> > > > > But I just use it to show the trend because I heard that it's
> > > > > not
> > > > recommended to use micro benchmarks like test_memcpy_perf for
> > > > memcpy performance report as they aren't likely able to reflect
> > > > performance of real world applications.
> > > >
> > > > Yes real applications can hide the memcpy cost.
> > > > Sometimes, the cost appear for real :)
> > > >
> > > > > Details can be seen at
> > > > > https://software.intel.com/en-us/articles/performance-optimizati
> > > > > on-of-
> > > > > memcpy-in-dpdk
> > > > >
> > > > > And I didn't see drop in testpmd txonly test. Maybe it's because
> > > > > not a lot
> > > > memcpy calls.
> > > >
> > > > It has been seen in a mlx4 use-case using more memcpy.
> > > > I think 15% in micro-benchmark is too much.
> > > > What can we do? Raise the threshold?
> > > >
> > > I think so. If there is big drop, can try raise the threshold. Maybe 1024?
> but not sure.
> > > But I didn't reproduce the 15% drop on mellanox and not sure how to
> verify it.
> >
> > I think we should focus on micro-benchmark and find a reasonnable
> > threshold for a reasonnable drop tradeoff.
> >
> Sadly, it may not be that simple. What shows best performance for micro-
> benchmarks may not show the same effect in a real application.
> 
> /Bruce

Then how to measure the performance?

And I cannot reproduce 15% drop on mellanox.
Could the person who tested 15% drop help to do test again with 1024 threshold and see if there is any improvement?
  
Li, Xiaoyun Oct. 25, 2017, 6:55 a.m. UTC | #15
Hi

> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Li, Xiaoyun
> Sent: Friday, October 20, 2017 09:03
> To: Richardson, Bruce <bruce.richardson@intel.com>; Thomas Monjalon
> <thomas@monjalon.net>
> Cc: Ananyev, Konstantin <konstantin.ananyev@intel.com>; dev@dpdk.org;
> Lu, Wenzhuo <wenzhuo.lu@intel.com>; Zhang, Helin
> <helin.zhang@intel.com>; ophirmu@mellanox.com
> Subject: Re: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch over
> memcpy
> 
> 
> 
> > -----Original Message-----
> > From: Richardson, Bruce
> > Sent: Thursday, October 19, 2017 17:30
> > To: Thomas Monjalon <thomas@monjalon.net>
> > Cc: Li, Xiaoyun <xiaoyun.li@intel.com>; Ananyev, Konstantin
> > <konstantin.ananyev@intel.com>; dev@dpdk.org; Lu, Wenzhuo
> > <wenzhuo.lu@intel.com>; Zhang, Helin <helin.zhang@intel.com>;
> > ophirmu@mellanox.com
> > Subject: Re: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch over
> > memcpy
> >
> > On Thu, Oct 19, 2017 at 11:00:33AM +0200, Thomas Monjalon wrote:
> > > 19/10/2017 10:50, Li, Xiaoyun:
> > > >
> > > > > -----Original Message-----
> > > > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > > > Sent: Thursday, October 19, 2017 16:34
> > > > > To: Li, Xiaoyun <xiaoyun.li@intel.com>
> > > > > Cc: Ananyev, Konstantin <konstantin.ananyev@intel.com>;
> > > > > Richardson, Bruce <bruce.richardson@intel.com>; dev@dpdk.org;
> > > > > Lu, Wenzhuo <wenzhuo.lu@intel.com>; Zhang, Helin
> > > > > <helin.zhang@intel.com>; ophirmu@mellanox.com
> > > > > Subject: Re: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time
> > > > > dispatch over memcpy
> > > > >
> > > > > 19/10/2017 09:51, Li, Xiaoyun:
> > > > > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > > > > > 19/10/2017 04:45, Li, Xiaoyun:
> > > > > > > > Hi
> > > > > > > > > > >
> > > > > > > > > > > The significant change of this patch is to call a
> > > > > > > > > > > function pointer for packet size > 128
> > (RTE_X86_MEMCPY_THRESH).
> > > > > > > > > > The perf drop is due to function call replacing inline.
> > > > > > > > > >
> > > > > > > > > > > Please could you provide some benchmark numbers?
> > > > > > > > > > I ran memcpy_perf_test which would show the time cost
> > > > > > > > > > of memcpy. I ran it on broadwell with sse and avx2.
> > > > > > > > > > But I just draw pictures and looked at the trend not
> > > > > > > > > > computed the exact percentage. Sorry about that.
> > > > > > > > > > The picture shows results of copy size of 2, 4, 6, 8,
> > > > > > > > > > 9, 12, 16, 32, 64, 128, 192, 256, 320, 384, 448, 512,
> > > > > > > > > > 768, 1024, 1518, 1522, 1536, 1600, 2048, 2560, 3072,
> > > > > > > > > > 3584, 4096, 4608, 5120, 5632, 6144, 6656, 7168,
> > > > > > > > > 7680, 8192.
> > > > > > > > > > In my test, the size grows, the drop degrades. (Using
> > > > > > > > > > copy time indicates the
> > > > > > > > > > perf.) From the trend picture, when the size is
> > > > > > > > > > smaller than
> > > > > > > > > > 128 bytes, the perf drops a lot, almost 50%. And above
> > > > > > > > > > 128 bytes, it approaches the original dpdk.
> > > > > > > > > > I computed it right now, it shows that when greater
> > > > > > > > > > than
> > > > > > > > > > 128 bytes and smaller than 1024 bytes, the perf drops
> > > > > > > > > > about
> > 15%.
> > > > > > > > > > When above
> > > > > > > > > > 1024 bytes, the perf drops about 4%.
> > > > > > > > > >
> > > > > > > > > > > From a test done at Mellanox, there might be a
> > > > > > > > > > > performance degradation of about 15% in testpmd
> > > > > > > > > > > txonly
> > with AVX2.
> > > > > > > > >
> > > > > > > >
> > > > > > > > I did tests on X710, XXV710, X540 and MT27710 but didn't
> > > > > > > > see
> > > > > > > performance degradation.
> > > > > > > >
> > > > > > > > I used command "./x86_64-native-linuxapp-gcc/app/testpmd
> > > > > > > > -c 0xf -n
> > > > > > > > 4 -- -
> > > > > > > I" and set fwd txonly.
> > > > > > > > I tested it on v17.11-rc1, then revert my patch and tested it
> again.
> > > > > > > > Show port stats all and see the throughput pps. But the
> > > > > > > > results are similar
> > > > > > > and no drop.
> > > > > > > >
> > > > > > > > Did I miss something?
> > > > > > >
> > > > > > > I do not understand. Yesterday you confirmed a 15% drop with
> > > > > > > buffers between
> > > > > > > 128 and 1024 bytes.
> > > > > > > But you do not see this drop in your txonly tests, right?
> > > > > > >
> > > > > > Yes. The drop is using test.
> > > > > > Using command "make test -j" and then " ./build/app/test -c f -n 4 "
> > > > > > Then run "memcpy_perf_autotest"
> > > > > > The results are the cycles that memory copy costs.
> > > > > > But I just use it to show the trend because I heard that it's
> > > > > > not
> > > > > recommended to use micro benchmarks like test_memcpy_perf for
> > > > > memcpy performance report as they aren't likely able to reflect
> > > > > performance of real world applications.
> > > > >
> > > > > Yes real applications can hide the memcpy cost.
> > > > > Sometimes, the cost appear for real :)
> > > > >
> > > > > > Details can be seen at
> > > > > > https://software.intel.com/en-us/articles/performance-optimiza
> > > > > > ti
> > > > > > on-of-
> > > > > > memcpy-in-dpdk
> > > > > >
> > > > > > And I didn't see drop in testpmd txonly test. Maybe it's
> > > > > > because not a lot
> > > > > memcpy calls.
> > > > >
> > > > > It has been seen in a mlx4 use-case using more memcpy.
> > > > > I think 15% in micro-benchmark is too much.
> > > > > What can we do? Raise the threshold?
> > > > >
> > > > I think so. If there is big drop, can try raise the threshold. Maybe 1024?
> > but not sure.
> > > > But I didn't reproduce the 15% drop on mellanox and not sure how
> > > > to
> > verify it.
> > >
> > > I think we should focus on micro-benchmark and find a reasonnable
> > > threshold for a reasonnable drop tradeoff.
> > >
> > Sadly, it may not be that simple. What shows best performance for
> > micro- benchmarks may not show the same effect in a real application.
> >
> > /Bruce
> 
> Then how to measure the performance?
> 
> And I cannot reproduce 15% drop on mellanox.
> Could the person who tested 15% drop help to do test again with 1024
> threshold and see if there is any improvement?

As Bruce said, best performance on micro-benchmark may not show the same effect in real applications.
And I cannot reproduce the 15% drop.
And I don't know if raising the threshold can improve the perf or not.
Could the person who tested 15% drop help to do test again with 1024 threshold and see if there is any improvement?

Best Regards
Xiaoyun Li
  
Thomas Monjalon Oct. 25, 2017, 7:25 a.m. UTC | #16
25/10/2017 08:55, Li, Xiaoyun:
> From: Li, Xiaoyun
> > From: Richardson, Bruce
> > > On Thu, Oct 19, 2017 at 11:00:33AM +0200, Thomas Monjalon wrote:
> > > > 19/10/2017 10:50, Li, Xiaoyun:
> > > > > From: Thomas Monjalon
> > > > > > 19/10/2017 09:51, Li, Xiaoyun:
> > > > > > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > > > > > > 19/10/2017 04:45, Li, Xiaoyun:
> > > > > > > > > Hi
> > > > > > > > > > > >
> > > > > > > > > > > > The significant change of this patch is to call a
> > > > > > > > > > > > function pointer for packet size > 128
> > > (RTE_X86_MEMCPY_THRESH).
> > > > > > > > > > > The perf drop is due to function call replacing inline.
> > > > > > > > > > >
> > > > > > > > > > > > Please could you provide some benchmark numbers?
> > > > > > > > > > > I ran memcpy_perf_test which would show the time cost
> > > > > > > > > > > of memcpy. I ran it on broadwell with sse and avx2.
> > > > > > > > > > > But I just draw pictures and looked at the trend not
> > > > > > > > > > > computed the exact percentage. Sorry about that.
> > > > > > > > > > > The picture shows results of copy size of 2, 4, 6, 8,
> > > > > > > > > > > 9, 12, 16, 32, 64, 128, 192, 256, 320, 384, 448, 512,
> > > > > > > > > > > 768, 1024, 1518, 1522, 1536, 1600, 2048, 2560, 3072,
> > > > > > > > > > > 3584, 4096, 4608, 5120, 5632, 6144, 6656, 7168,
> > > > > > > > > > 7680, 8192.
> > > > > > > > > > > In my test, the size grows, the drop degrades. (Using
> > > > > > > > > > > copy time indicates the
> > > > > > > > > > > perf.) From the trend picture, when the size is
> > > > > > > > > > > smaller than
> > > > > > > > > > > 128 bytes, the perf drops a lot, almost 50%. And above
> > > > > > > > > > > 128 bytes, it approaches the original dpdk.
> > > > > > > > > > > I computed it right now, it shows that when greater
> > > > > > > > > > > than
> > > > > > > > > > > 128 bytes and smaller than 1024 bytes, the perf drops
> > > > > > > > > > > about
> > > 15%.
> > > > > > > > > > > When above
> > > > > > > > > > > 1024 bytes, the perf drops about 4%.
> > > > > > > > > > >
> > > > > > > > > > > > From a test done at Mellanox, there might be a
> > > > > > > > > > > > performance degradation of about 15% in testpmd
> > > > > > > > > > > > txonly
> > > with AVX2.
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > I did tests on X710, XXV710, X540 and MT27710 but didn't
> > > > > > > > > see
> > > > > > > > performance degradation.
> > > > > > > > >
> > > > > > > > > I used command "./x86_64-native-linuxapp-gcc/app/testpmd
> > > > > > > > > -c 0xf -n
> > > > > > > > > 4 -- -
> > > > > > > > I" and set fwd txonly.
> > > > > > > > > I tested it on v17.11-rc1, then revert my patch and tested it
> > again.
> > > > > > > > > Show port stats all and see the throughput pps. But the
> > > > > > > > > results are similar
> > > > > > > > and no drop.
> > > > > > > > >
> > > > > > > > > Did I miss something?
> > > > > > > >
> > > > > > > > I do not understand. Yesterday you confirmed a 15% drop with
> > > > > > > > buffers between
> > > > > > > > 128 and 1024 bytes.
> > > > > > > > But you do not see this drop in your txonly tests, right?
> > > > > > > >
> > > > > > > Yes. The drop is using test.
> > > > > > > Using command "make test -j" and then " ./build/app/test -c f -n 4 "
> > > > > > > Then run "memcpy_perf_autotest"
> > > > > > > The results are the cycles that memory copy costs.
> > > > > > > But I just use it to show the trend because I heard that it's
> > > > > > > not
> > > > > > recommended to use micro benchmarks like test_memcpy_perf for
> > > > > > memcpy performance report as they aren't likely able to reflect
> > > > > > performance of real world applications.
> > > > > >
> > > > > > Yes real applications can hide the memcpy cost.
> > > > > > Sometimes, the cost appear for real :)
> > > > > >
> > > > > > > Details can be seen at
> > > > > > > https://software.intel.com/en-us/articles/performance-optimiza
> > > > > > > ti
> > > > > > > on-of-
> > > > > > > memcpy-in-dpdk
> > > > > > >
> > > > > > > And I didn't see drop in testpmd txonly test. Maybe it's
> > > > > > > because not a lot
> > > > > > memcpy calls.
> > > > > >
> > > > > > It has been seen in a mlx4 use-case using more memcpy.
> > > > > > I think 15% in micro-benchmark is too much.
> > > > > > What can we do? Raise the threshold?
> > > > > >
> > > > > I think so. If there is big drop, can try raise the threshold. Maybe 1024?
> > > but not sure.
> > > > > But I didn't reproduce the 15% drop on mellanox and not sure how
> > > > > to
> > > verify it.
> > > >
> > > > I think we should focus on micro-benchmark and find a reasonnable
> > > > threshold for a reasonnable drop tradeoff.
> > > >
> > > Sadly, it may not be that simple. What shows best performance for
> > > micro- benchmarks may not show the same effect in a real application.
> > >
> > > /Bruce
> > 
> > Then how to measure the performance?
> > 
> > And I cannot reproduce 15% drop on mellanox.
> > Could the person who tested 15% drop help to do test again with 1024
> > threshold and see if there is any improvement?
> 
> As Bruce said, best performance on micro-benchmark may not show the same effect in real applications.

Yes real applications may hide the impact.
You keep saying that it is a reason to allow degrading memcpy raw perf.
But can you see better performance with buffers of 256 bytes with
any application thanks to your patch?
I am not sure whether there is a benefit in keeping code which implies
a significant drop in micro-benchmarks.

> And I cannot reproduce the 15% drop.
> And I don't know if raising the threshold can improve the perf or not.
> Could the person who tested 15% drop help to do test again with 1024 threshold and see if there is any improvement?

We will test an increased threshold today.
  
Ananyev, Konstantin Oct. 25, 2017, 8:50 a.m. UTC | #17
> -----Original Message-----
> From: Li, Xiaoyun
> Sent: Wednesday, October 25, 2017 7:55 AM
> To: Li, Xiaoyun <xiaoyun.li@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>; Thomas Monjalon
> <thomas@monjalon.net>
> Cc: Ananyev, Konstantin <konstantin.ananyev@intel.com>; dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Zhang, Helin
> <helin.zhang@intel.com>; ophirmu@mellanox.com
> Subject: RE: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch over memcpy
> 
> Hi
> 
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Li, Xiaoyun
> > Sent: Friday, October 20, 2017 09:03
> > To: Richardson, Bruce <bruce.richardson@intel.com>; Thomas Monjalon
> > <thomas@monjalon.net>
> > Cc: Ananyev, Konstantin <konstantin.ananyev@intel.com>; dev@dpdk.org;
> > Lu, Wenzhuo <wenzhuo.lu@intel.com>; Zhang, Helin
> > <helin.zhang@intel.com>; ophirmu@mellanox.com
> > Subject: Re: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch over
> > memcpy
> >
> >
> >
> > > -----Original Message-----
> > > From: Richardson, Bruce
> > > Sent: Thursday, October 19, 2017 17:30
> > > To: Thomas Monjalon <thomas@monjalon.net>
> > > Cc: Li, Xiaoyun <xiaoyun.li@intel.com>; Ananyev, Konstantin
> > > <konstantin.ananyev@intel.com>; dev@dpdk.org; Lu, Wenzhuo
> > > <wenzhuo.lu@intel.com>; Zhang, Helin <helin.zhang@intel.com>;
> > > ophirmu@mellanox.com
> > > Subject: Re: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch over
> > > memcpy
> > >
> > > On Thu, Oct 19, 2017 at 11:00:33AM +0200, Thomas Monjalon wrote:
> > > > 19/10/2017 10:50, Li, Xiaoyun:
> > > > >
> > > > > > -----Original Message-----
> > > > > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > > > > Sent: Thursday, October 19, 2017 16:34
> > > > > > To: Li, Xiaoyun <xiaoyun.li@intel.com>
> > > > > > Cc: Ananyev, Konstantin <konstantin.ananyev@intel.com>;
> > > > > > Richardson, Bruce <bruce.richardson@intel.com>; dev@dpdk.org;
> > > > > > Lu, Wenzhuo <wenzhuo.lu@intel.com>; Zhang, Helin
> > > > > > <helin.zhang@intel.com>; ophirmu@mellanox.com
> > > > > > Subject: Re: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time
> > > > > > dispatch over memcpy
> > > > > >
> > > > > > 19/10/2017 09:51, Li, Xiaoyun:
> > > > > > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > > > > > > 19/10/2017 04:45, Li, Xiaoyun:
> > > > > > > > > Hi
> > > > > > > > > > > >
> > > > > > > > > > > > The significant change of this patch is to call a
> > > > > > > > > > > > function pointer for packet size > 128
> > > (RTE_X86_MEMCPY_THRESH).
> > > > > > > > > > > The perf drop is due to function call replacing inline.
> > > > > > > > > > >
> > > > > > > > > > > > Please could you provide some benchmark numbers?
> > > > > > > > > > > I ran memcpy_perf_test which would show the time cost
> > > > > > > > > > > of memcpy. I ran it on broadwell with sse and avx2.
> > > > > > > > > > > But I just draw pictures and looked at the trend not
> > > > > > > > > > > computed the exact percentage. Sorry about that.
> > > > > > > > > > > The picture shows results of copy size of 2, 4, 6, 8,
> > > > > > > > > > > 9, 12, 16, 32, 64, 128, 192, 256, 320, 384, 448, 512,
> > > > > > > > > > > 768, 1024, 1518, 1522, 1536, 1600, 2048, 2560, 3072,
> > > > > > > > > > > 3584, 4096, 4608, 5120, 5632, 6144, 6656, 7168,
> > > > > > > > > > 7680, 8192.
> > > > > > > > > > > In my test, the size grows, the drop degrades. (Using
> > > > > > > > > > > copy time indicates the
> > > > > > > > > > > perf.) From the trend picture, when the size is
> > > > > > > > > > > smaller than
> > > > > > > > > > > 128 bytes, the perf drops a lot, almost 50%. And above
> > > > > > > > > > > 128 bytes, it approaches the original dpdk.
> > > > > > > > > > > I computed it right now, it shows that when greater
> > > > > > > > > > > than
> > > > > > > > > > > 128 bytes and smaller than 1024 bytes, the perf drops
> > > > > > > > > > > about
> > > 15%.
> > > > > > > > > > > When above
> > > > > > > > > > > 1024 bytes, the perf drops about 4%.
> > > > > > > > > > >
> > > > > > > > > > > > From a test done at Mellanox, there might be a
> > > > > > > > > > > > performance degradation of about 15% in testpmd
> > > > > > > > > > > > txonly
> > > with AVX2.
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > I did tests on X710, XXV710, X540 and MT27710 but didn't
> > > > > > > > > see
> > > > > > > > performance degradation.
> > > > > > > > >
> > > > > > > > > I used command "./x86_64-native-linuxapp-gcc/app/testpmd
> > > > > > > > > -c 0xf -n
> > > > > > > > > 4 -- -
> > > > > > > > I" and set fwd txonly.
> > > > > > > > > I tested it on v17.11-rc1, then revert my patch and tested it
> > again.
> > > > > > > > > Show port stats all and see the throughput pps. But the
> > > > > > > > > results are similar
> > > > > > > > and no drop.
> > > > > > > > >
> > > > > > > > > Did I miss something?
> > > > > > > >
> > > > > > > > I do not understand. Yesterday you confirmed a 15% drop with
> > > > > > > > buffers between
> > > > > > > > 128 and 1024 bytes.
> > > > > > > > But you do not see this drop in your txonly tests, right?
> > > > > > > >
> > > > > > > Yes. The drop is using test.
> > > > > > > Using command "make test -j" and then " ./build/app/test -c f -n 4 "
> > > > > > > Then run "memcpy_perf_autotest"
> > > > > > > The results are the cycles that memory copy costs.
> > > > > > > But I just use it to show the trend because I heard that it's
> > > > > > > not
> > > > > > recommended to use micro benchmarks like test_memcpy_perf for
> > > > > > memcpy performance report as they aren't likely able to reflect
> > > > > > performance of real world applications.
> > > > > >
> > > > > > Yes real applications can hide the memcpy cost.
> > > > > > Sometimes, the cost appear for real :)
> > > > > >
> > > > > > > Details can be seen at
> > > > > > > https://software.intel.com/en-us/articles/performance-optimiza
> > > > > > > ti
> > > > > > > on-of-
> > > > > > > memcpy-in-dpdk
> > > > > > >
> > > > > > > And I didn't see drop in testpmd txonly test. Maybe it's
> > > > > > > because not a lot
> > > > > > memcpy calls.
> > > > > >
> > > > > > It has been seen in a mlx4 use-case using more memcpy.
> > > > > > I think 15% in micro-benchmark is too much.
> > > > > > What can we do? Raise the threshold?
> > > > > >
> > > > > I think so. If there is big drop, can try raise the threshold. Maybe 1024?
> > > but not sure.
> > > > > But I didn't reproduce the 15% drop on mellanox and not sure how
> > > > > to
> > > verify it.
> > > >
> > > > I think we should focus on micro-benchmark and find a reasonable
> > > > threshold for a reasonable drop tradeoff.
> > > >
> > > Sadly, it may not be that simple. What shows best performance for
> > > micro- benchmarks may not show the same effect in a real application.
> > >
> > > /Bruce
> >
> > Then how to measure the performance?
> >
> > And I cannot reproduce 15% drop on mellanox.
> > Could the person who tested 15% drop help to do test again with 1024
> > threshold and see if there is any improvement?
> 
> As Bruce said, best performance on micro-benchmark may not show the same effect in real applications.
> And I cannot reproduce the 15% drop.
> And I don't know if raising the threshold can improve the perf or not.
> Could the person who tested 15% drop help to do test again with 1024 threshold and see if there is any improvement?

As I already asked before - why not to make that threshold dynamic?
Konstantin

> 
> Best Regards
> Xiaoyun Li
> 
>
  
Li, Xiaoyun Oct. 25, 2017, 8:54 a.m. UTC | #18
> -----Original Message-----
> From: Ananyev, Konstantin
> Sent: Wednesday, October 25, 2017 16:51
> To: Li, Xiaoyun <xiaoyun.li@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Thomas Monjalon <thomas@monjalon.net>
> Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Zhang, Helin
> <helin.zhang@intel.com>; ophirmu@mellanox.com
> Subject: RE: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch over
> memcpy
> 
> 
> 
> > -----Original Message-----
> > From: Li, Xiaoyun
> > Sent: Wednesday, October 25, 2017 7:55 AM
> > To: Li, Xiaoyun <xiaoyun.li@intel.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>; Thomas Monjalon <thomas@monjalon.net>
> > Cc: Ananyev, Konstantin <konstantin.ananyev@intel.com>; dev@dpdk.org;
> > Lu, Wenzhuo <wenzhuo.lu@intel.com>; Zhang, Helin
> > <helin.zhang@intel.com>; ophirmu@mellanox.com
> > Subject: RE: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch over
> > memcpy
> >
> > Hi
> >
> > > -----Original Message-----
> > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Li, Xiaoyun
> > > Sent: Friday, October 20, 2017 09:03
> > > To: Richardson, Bruce <bruce.richardson@intel.com>; Thomas Monjalon
> > > <thomas@monjalon.net>
> > > Cc: Ananyev, Konstantin <konstantin.ananyev@intel.com>;
> > > dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Zhang, Helin
> > > <helin.zhang@intel.com>; ophirmu@mellanox.com
> > > Subject: Re: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch
> > > over memcpy
> > >
> > >
> > >
> > > > -----Original Message-----
> > > > From: Richardson, Bruce
> > > > Sent: Thursday, October 19, 2017 17:30
> > > > To: Thomas Monjalon <thomas@monjalon.net>
> > > > Cc: Li, Xiaoyun <xiaoyun.li@intel.com>; Ananyev, Konstantin
> > > > <konstantin.ananyev@intel.com>; dev@dpdk.org; Lu, Wenzhuo
> > > > <wenzhuo.lu@intel.com>; Zhang, Helin <helin.zhang@intel.com>;
> > > > ophirmu@mellanox.com
> > > > Subject: Re: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch
> > > > over memcpy
> > > >
> > > > On Thu, Oct 19, 2017 at 11:00:33AM +0200, Thomas Monjalon wrote:
> > > > > 19/10/2017 10:50, Li, Xiaoyun:
> > > > > >
> > > > > > > -----Original Message-----
> > > > > > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > > > > > Sent: Thursday, October 19, 2017 16:34
> > > > > > > To: Li, Xiaoyun <xiaoyun.li@intel.com>
> > > > > > > Cc: Ananyev, Konstantin <konstantin.ananyev@intel.com>;
> > > > > > > Richardson, Bruce <bruce.richardson@intel.com>;
> > > > > > > dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Zhang,
> > > > > > > Helin <helin.zhang@intel.com>; ophirmu@mellanox.com
> > > > > > > Subject: Re: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time
> > > > > > > dispatch over memcpy
> > > > > > >
> > > > > > > 19/10/2017 09:51, Li, Xiaoyun:
> > > > > > > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > > > > > > > 19/10/2017 04:45, Li, Xiaoyun:
> > > > > > > > > > Hi
> > > > > > > > > > > > >
> > > > > > > > > > > > > The significant change of this patch is to call
> > > > > > > > > > > > > a function pointer for packet size > 128
> > > > (RTE_X86_MEMCPY_THRESH).
> > > > > > > > > > > > The perf drop is due to function call replacing inline.
> > > > > > > > > > > >
> > > > > > > > > > > > > Please could you provide some benchmark numbers?
> > > > > > > > > > > > I ran memcpy_perf_test which would show the time
> > > > > > > > > > > > cost of memcpy. I ran it on broadwell with sse and avx2.
> > > > > > > > > > > > But I just draw pictures and looked at the trend
> > > > > > > > > > > > not computed the exact percentage. Sorry about that.
> > > > > > > > > > > > The picture shows results of copy size of 2, 4, 6,
> > > > > > > > > > > > 8, 9, 12, 16, 32, 64, 128, 192, 256, 320, 384,
> > > > > > > > > > > > 448, 512, 768, 1024, 1518, 1522, 1536, 1600, 2048,
> > > > > > > > > > > > 2560, 3072, 3584, 4096, 4608, 5120, 5632, 6144,
> > > > > > > > > > > > 6656, 7168,
> > > > > > > > > > > 7680, 8192.
> > > > > > > > > > > > In my test, the size grows, the drop degrades.
> > > > > > > > > > > > (Using copy time indicates the
> > > > > > > > > > > > perf.) From the trend picture, when the size is
> > > > > > > > > > > > smaller than
> > > > > > > > > > > > 128 bytes, the perf drops a lot, almost 50%. And
> > > > > > > > > > > > above
> > > > > > > > > > > > 128 bytes, it approaches the original dpdk.
> > > > > > > > > > > > I computed it right now, it shows that when
> > > > > > > > > > > > greater than
> > > > > > > > > > > > 128 bytes and smaller than 1024 bytes, the perf
> > > > > > > > > > > > drops about
> > > > 15%.
> > > > > > > > > > > > When above
> > > > > > > > > > > > 1024 bytes, the perf drops about 4%.
> > > > > > > > > > > >
> > > > > > > > > > > > > From a test done at Mellanox, there might be a
> > > > > > > > > > > > > performance degradation of about 15% in testpmd
> > > > > > > > > > > > > txonly
> > > > with AVX2.
> > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > I did tests on X710, XXV710, X540 and MT27710 but
> > > > > > > > > > didn't see
> > > > > > > > > performance degradation.
> > > > > > > > > >
> > > > > > > > > > I used command
> > > > > > > > > > "./x86_64-native-linuxapp-gcc/app/testpmd
> > > > > > > > > > -c 0xf -n
> > > > > > > > > > 4 -- -
> > > > > > > > > I" and set fwd txonly.
> > > > > > > > > > I tested it on v17.11-rc1, then revert my patch and
> > > > > > > > > > tested it
> > > again.
> > > > > > > > > > Show port stats all and see the throughput pps. But
> > > > > > > > > > the results are similar
> > > > > > > > > and no drop.
> > > > > > > > > >
> > > > > > > > > > Did I miss something?
> > > > > > > > >
> > > > > > > > > I do not understand. Yesterday you confirmed a 15% drop
> > > > > > > > > with buffers between
> > > > > > > > > 128 and 1024 bytes.
> > > > > > > > > But you do not see this drop in your txonly tests, right?
> > > > > > > > >
> > > > > > > > Yes. The drop is using test.
> > > > > > > > Using command "make test -j" and then " ./build/app/test -c f -n
> 4 "
> > > > > > > > Then run "memcpy_perf_autotest"
> > > > > > > > The results are the cycles that memory copy costs.
> > > > > > > > But I just use it to show the trend because I heard that
> > > > > > > > it's not
> > > > > > > recommended to use micro benchmarks like test_memcpy_perf
> > > > > > > for memcpy performance report as they aren't likely able to
> > > > > > > reflect performance of real world applications.
> > > > > > >
> > > > > > > Yes real applications can hide the memcpy cost.
> > > > > > > Sometimes, the cost appear for real :)
> > > > > > >
> > > > > > > > Details can be seen at
> > > > > > > > https://software.intel.com/en-us/articles/performance-opti
> > > > > > > > miza
> > > > > > > > ti
> > > > > > > > on-of-
> > > > > > > > memcpy-in-dpdk
> > > > > > > >
> > > > > > > > And I didn't see drop in testpmd txonly test. Maybe it's
> > > > > > > > because not a lot
> > > > > > > memcpy calls.
> > > > > > >
> > > > > > > It has been seen in a mlx4 use-case using more memcpy.
> > > > > > > I think 15% in micro-benchmark is too much.
> > > > > > > What can we do? Raise the threshold?
> > > > > > >
> > > > > > I think so. If there is big drop, can try raise the threshold. Maybe
> 1024?
> > > > but not sure.
> > > > > > But I didn't reproduce the 15% drop on mellanox and not sure
> > > > > > how to
> > > > verify it.
> > > > >
> > > > > I think we should focus on micro-benchmark and find a
> > > > > > reasonable threshold for a reasonable drop tradeoff.
> > > > >
> > > > Sadly, it may not be that simple. What shows best performance for
> > > > micro- benchmarks may not show the same effect in a real application.
> > > >
> > > > /Bruce
> > >
> > > Then how to measure the performance?
> > >
> > > And I cannot reproduce 15% drop on mellanox.
> > > Could the person who tested 15% drop help to do test again with 1024
> > > threshold and see if there is any improvement?
> >
> > As Bruce said, best performance on micro-benchmark may not show the
> same effect in real applications.
> > And I cannot reproduce the 15% drop.
> > And I don't know if raising the threshold can improve the perf or not.
> > Could the person who tested 15% drop help to do test again with 1024
> threshold and see if there is any improvement?
> 
> As I already asked before - why not to make that threshold dynamic?
> Konstantin
> 
I want to confirm that raising the threshold is useful. Then we can make it dynamic and set a very large value as the default.

> >
> > Best Regards
> > Xiaoyun Li
> >
> >
  
Thomas Monjalon Oct. 25, 2017, 9 a.m. UTC | #19
25/10/2017 10:54, Li, Xiaoyun:
> > > > > > I think we should focus on micro-benchmark and find a
> > > > > > reasonable threshold for a reasonable drop tradeoff.
> > > > > >
> > > > > Sadly, it may not be that simple. What shows best performance for
> > > > > micro- benchmarks may not show the same effect in a real application.
> > > > >
> > > > > /Bruce
> > > >
> > > > Then how to measure the performance?
> > > >
> > > > And I cannot reproduce 15% drop on mellanox.
> > > > Could the person who tested 15% drop help to do test again with 1024
> > > > threshold and see if there is any improvement?
> > >
> > > As Bruce said, best performance on micro-benchmark may not show the
> > same effect in real applications.
> > > And I cannot reproduce the 15% drop.
> > > And I don't know if raising the threshold can improve the perf or not.
> > > Could the person who tested 15% drop help to do test again with 1024
> > threshold and see if there is any improvement?
> > 
> > As I already asked before - why not to make that threshold dynamic?
> > Konstantin
> > 
> I want to confirm that raising threshold is useful. Then can make it dynamic and set it very large as default.

You can confirm it with micro-benchmarks.
  
Ananyev, Konstantin Oct. 25, 2017, 9:14 a.m. UTC | #20
> -----Original Message-----
> From: Li, Xiaoyun
> Sent: Wednesday, October 25, 2017 9:54 AM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>; Thomas Monjalon
> <thomas@monjalon.net>
> Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Zhang, Helin <helin.zhang@intel.com>; ophirmu@mellanox.com
> Subject: RE: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch over memcpy
> 
> 
> 
> > -----Original Message-----
> > From: Ananyev, Konstantin
> > Sent: Wednesday, October 25, 2017 16:51
> > To: Li, Xiaoyun <xiaoyun.li@intel.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>; Thomas Monjalon <thomas@monjalon.net>
> > Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Zhang, Helin
> > <helin.zhang@intel.com>; ophirmu@mellanox.com
> > Subject: RE: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch over
> > memcpy
> >
> >
> >
> > > -----Original Message-----
> > > From: Li, Xiaoyun
> > > Sent: Wednesday, October 25, 2017 7:55 AM
> > > To: Li, Xiaoyun <xiaoyun.li@intel.com>; Richardson, Bruce
> > > <bruce.richardson@intel.com>; Thomas Monjalon <thomas@monjalon.net>
> > > Cc: Ananyev, Konstantin <konstantin.ananyev@intel.com>; dev@dpdk.org;
> > > Lu, Wenzhuo <wenzhuo.lu@intel.com>; Zhang, Helin
> > > <helin.zhang@intel.com>; ophirmu@mellanox.com
> > > Subject: RE: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch over
> > > memcpy
> > >
> > > Hi
> > >
> > > > -----Original Message-----
> > > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Li, Xiaoyun
> > > > Sent: Friday, October 20, 2017 09:03
> > > > To: Richardson, Bruce <bruce.richardson@intel.com>; Thomas Monjalon
> > > > <thomas@monjalon.net>
> > > > Cc: Ananyev, Konstantin <konstantin.ananyev@intel.com>;
> > > > dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Zhang, Helin
> > > > <helin.zhang@intel.com>; ophirmu@mellanox.com
> > > > Subject: Re: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch
> > > > over memcpy
> > > >
> > > >
> > > >
> > > > > -----Original Message-----
> > > > > From: Richardson, Bruce
> > > > > Sent: Thursday, October 19, 2017 17:30
> > > > > To: Thomas Monjalon <thomas@monjalon.net>
> > > > > Cc: Li, Xiaoyun <xiaoyun.li@intel.com>; Ananyev, Konstantin
> > > > > <konstantin.ananyev@intel.com>; dev@dpdk.org; Lu, Wenzhuo
> > > > > <wenzhuo.lu@intel.com>; Zhang, Helin <helin.zhang@intel.com>;
> > > > > ophirmu@mellanox.com
> > > > > Subject: Re: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch
> > > > > over memcpy
> > > > >
> > > > > On Thu, Oct 19, 2017 at 11:00:33AM +0200, Thomas Monjalon wrote:
> > > > > > 19/10/2017 10:50, Li, Xiaoyun:
> > > > > > >
> > > > > > > > -----Original Message-----
> > > > > > > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > > > > > > Sent: Thursday, October 19, 2017 16:34
> > > > > > > > To: Li, Xiaoyun <xiaoyun.li@intel.com>
> > > > > > > > Cc: Ananyev, Konstantin <konstantin.ananyev@intel.com>;
> > > > > > > > Richardson, Bruce <bruce.richardson@intel.com>;
> > > > > > > > dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Zhang,
> > > > > > > > Helin <helin.zhang@intel.com>; ophirmu@mellanox.com
> > > > > > > > Subject: Re: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time
> > > > > > > > dispatch over memcpy
> > > > > > > >
> > > > > > > > 19/10/2017 09:51, Li, Xiaoyun:
> > > > > > > > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > > > > > > > > 19/10/2017 04:45, Li, Xiaoyun:
> > > > > > > > > > > Hi
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > The significant change of this patch is to call
> > > > > > > > > > > > > > a function pointer for packet size > 128
> > > > > (RTE_X86_MEMCPY_THRESH).
> > > > > > > > > > > > > The perf drop is due to function call replacing inline.
> > > > > > > > > > > > >
> > > > > > > > > > > > > > Please could you provide some benchmark numbers?
> > > > > > > > > > > > > I ran memcpy_perf_test which would show the time
> > > > > > > > > > > > > cost of memcpy. I ran it on broadwell with sse and avx2.
> > > > > > > > > > > > > But I just draw pictures and looked at the trend
> > > > > > > > > > > > > not computed the exact percentage. Sorry about that.
> > > > > > > > > > > > > The picture shows results of copy size of 2, 4, 6,
> > > > > > > > > > > > > 8, 9, 12, 16, 32, 64, 128, 192, 256, 320, 384,
> > > > > > > > > > > > > 448, 512, 768, 1024, 1518, 1522, 1536, 1600, 2048,
> > > > > > > > > > > > > 2560, 3072, 3584, 4096, 4608, 5120, 5632, 6144,
> > > > > > > > > > > > > 6656, 7168,
> > > > > > > > > > > > 7680, 8192.
> > > > > > > > > > > > > In my test, the size grows, the drop degrades.
> > > > > > > > > > > > > (Using copy time indicates the
> > > > > > > > > > > > > perf.) From the trend picture, when the size is
> > > > > > > > > > > > > smaller than
> > > > > > > > > > > > > 128 bytes, the perf drops a lot, almost 50%. And
> > > > > > > > > > > > > above
> > > > > > > > > > > > > 128 bytes, it approaches the original dpdk.
> > > > > > > > > > > > > I computed it right now, it shows that when
> > > > > > > > > > > > > greater than
> > > > > > > > > > > > > 128 bytes and smaller than 1024 bytes, the perf
> > > > > > > > > > > > > drops about
> > > > > 15%.
> > > > > > > > > > > > > When above
> > > > > > > > > > > > > 1024 bytes, the perf drops about 4%.
> > > > > > > > > > > > >
> > > > > > > > > > > > > > From a test done at Mellanox, there might be a
> > > > > > > > > > > > > > performance degradation of about 15% in testpmd
> > > > > > > > > > > > > > txonly
> > > > > with AVX2.
> > > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > I did tests on X710, XXV710, X540 and MT27710 but
> > > > > > > > > > > didn't see
> > > > > > > > > > performance degradation.
> > > > > > > > > > >
> > > > > > > > > > > I used command
> > > > > > > > > > > "./x86_64-native-linuxapp-gcc/app/testpmd
> > > > > > > > > > > -c 0xf -n
> > > > > > > > > > > 4 -- -
> > > > > > > > > > I" and set fwd txonly.
> > > > > > > > > > > I tested it on v17.11-rc1, then revert my patch and
> > > > > > > > > > > tested it
> > > > again.
> > > > > > > > > > > Show port stats all and see the throughput pps. But
> > > > > > > > > > > the results are similar
> > > > > > > > > > and no drop.
> > > > > > > > > > >
> > > > > > > > > > > Did I miss something?
> > > > > > > > > >
> > > > > > > > > > I do not understand. Yesterday you confirmed a 15% drop
> > > > > > > > > > with buffers between
> > > > > > > > > > 128 and 1024 bytes.
> > > > > > > > > > But you do not see this drop in your txonly tests, right?
> > > > > > > > > >
> > > > > > > > > Yes. The drop is using test.
> > > > > > > > > Using command "make test -j" and then " ./build/app/test -c f -n
> > 4 "
> > > > > > > > > Then run "memcpy_perf_autotest"
> > > > > > > > > The results are the cycles that memory copy costs.
> > > > > > > > > But I just use it to show the trend because I heard that
> > > > > > > > > it's not
> > > > > > > > recommended to use micro benchmarks like test_memcpy_perf
> > > > > > > > for memcpy performance report as they aren't likely able to
> > > > > > > > reflect performance of real world applications.
> > > > > > > >
> > > > > > > > Yes real applications can hide the memcpy cost.
> > > > > > > > Sometimes, the cost appear for real :)
> > > > > > > >
> > > > > > > > > Details can be seen at
> > > > > > > > > https://software.intel.com/en-us/articles/performance-opti
> > > > > > > > > miza
> > > > > > > > > ti
> > > > > > > > > on-of-
> > > > > > > > > memcpy-in-dpdk
> > > > > > > > >
> > > > > > > > > And I didn't see drop in testpmd txonly test. Maybe it's
> > > > > > > > > because not a lot
> > > > > > > > memcpy calls.
> > > > > > > >
> > > > > > > > It has been seen in a mlx4 use-case using more memcpy.
> > > > > > > > I think 15% in micro-benchmark is too much.
> > > > > > > > What can we do? Raise the threshold?
> > > > > > > >
> > > > > > > I think so. If there is big drop, can try raise the threshold. Maybe
> > 1024?
> > > > > but not sure.
> > > > > > > But I didn't reproduce the 15% drop on mellanox and not sure
> > > > > > > how to
> > > > > verify it.
> > > > > >
> > > > > > I think we should focus on micro-benchmark and find a
> > > > > > reasonable threshold for a reasonable drop tradeoff.
> > > > > >
> > > > > Sadly, it may not be that simple. What shows best performance for
> > > > > micro- benchmarks may not show the same effect in a real application.
> > > > >
> > > > > /Bruce
> > > >
> > > > Then how to measure the performance?
> > > >
> > > > And I cannot reproduce 15% drop on mellanox.
> > > > Could the person who tested 15% drop help to do test again with 1024
> > > > threshold and see if there is any improvement?
> > >
> > > As Bruce said, best performance on micro-benchmark may not show the
> > same effect in real applications.
> > > And I cannot reproduce the 15% drop.
> > > And I don't know if raising the threshold can improve the perf or not.
> > > Could the person who tested 15% drop help to do test again with 1024
> > threshold and see if there is any improvement?
> >
> > As I already asked before - why not to make that threshold dynamic?
> > Konstantin
> >
> I want to confirm that raising threshold is useful. Then can make it dynamic and set it very large as default.

Ok.

> 
> > >
> > > Best Regards
> > > Xiaoyun Li
> > >
> > >
  
Li, Xiaoyun Oct. 25, 2017, 10:32 a.m. UTC | #21
> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas@monjalon.net]
> Sent: Wednesday, October 25, 2017 17:00
> To: Li, Xiaoyun <xiaoyun.li@intel.com>
> Cc: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Richardson,
> Bruce <bruce.richardson@intel.com>; dev@dpdk.org; Lu, Wenzhuo
> <wenzhuo.lu@intel.com>; Zhang, Helin <helin.zhang@intel.com>;
> ophirmu@mellanox.com
> Subject: Re: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch over
> memcpy
> 
> 25/10/2017 10:54, Li, Xiaoyun:
> > > > > > > I think we should focus on micro-benchmark and find a
> > > > > > > reasonable threshold for a reasonable drop tradeoff.
> > > > > > >
> > > > > > Sadly, it may not be that simple. What shows best performance
> > > > > > for
> > > > > > micro- benchmarks may not show the same effect in a real
> application.
> > > > > >
> > > > > > /Bruce
> > > > >
> > > > > Then how to measure the performance?
> > > > >
> > > > > And I cannot reproduce 15% drop on mellanox.
> > > > > Could the person who tested 15% drop help to do test again with
> > > > > 1024 threshold and see if there is any improvement?
> > > >
> > > > As Bruce said, best performance on micro-benchmark may not show
> > > > the
> > > same effect in real applications.
> > > > And I cannot reproduce the 15% drop.
> > > > And I don't know if raising the threshold can improve the perf or not.
> > > > Could the person who tested 15% drop help to do test again with
> > > > 1024
> > > threshold and see if there is any improvement?
> > >
> > > As I already asked before - why not to make that threshold dynamic?
> > > Konstantin
> > >
> > I want to confirm that raising threshold is useful. Then can make it dynamic
> and set it very large as default.
> 
> You can confirm it with micro-benchmarks.

I did tests on memcpy_perf_test. Set threshold to 1024.
But when smaller than 1024 bytes, it costs 2~4 cycles more than the original.
For example, if the original cost is 10 cycles, it is now 12; then the drop is 2/12 = 16%.
I don't know this kind of drop matters a lot or not.
And above 1024 bytes, the drop is almost 4% as I said before.

/Xiaoyun
  
Thomas Monjalon Oct. 29, 2017, 8:49 a.m. UTC | #22
25/10/2017 09:25, Thomas Monjalon:
> 25/10/2017 08:55, Li, Xiaoyun:
> > From: Li, Xiaoyun
> > > From: Richardson, Bruce
> > > > On Thu, Oct 19, 2017 at 11:00:33AM +0200, Thomas Monjalon wrote:
> > > > > 19/10/2017 10:50, Li, Xiaoyun:
> > > > > > From: Thomas Monjalon
> > > > > > > 19/10/2017 09:51, Li, Xiaoyun:
> > > > > > > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > > > > > > > 19/10/2017 04:45, Li, Xiaoyun:
> > > > > > > > > > Hi
> > > > > > > > > > > > >
> > > > > > > > > > > > > The significant change of this patch is to call a
> > > > > > > > > > > > > function pointer for packet size > 128
> > > > (RTE_X86_MEMCPY_THRESH).
> > > > > > > > > > > > The perf drop is due to function call replacing inline.
> > > > > > > > > > > >
> > > > > > > > > > > > > Please could you provide some benchmark numbers?
> > > > > > > > > > > > I ran memcpy_perf_test which would show the time cost
> > > > > > > > > > > > of memcpy. I ran it on broadwell with sse and avx2.
> > > > > > > > > > > > But I just draw pictures and looked at the trend not
> > > > > > > > > > > > computed the exact percentage. Sorry about that.
> > > > > > > > > > > > The picture shows results of copy size of 2, 4, 6, 8,
> > > > > > > > > > > > 9, 12, 16, 32, 64, 128, 192, 256, 320, 384, 448, 512,
> > > > > > > > > > > > 768, 1024, 1518, 1522, 1536, 1600, 2048, 2560, 3072,
> > > > > > > > > > > > 3584, 4096, 4608, 5120, 5632, 6144, 6656, 7168,
> > > > > > > > > > > 7680, 8192.
> > > > > > > > > > > > In my test, the size grows, the drop degrades. (Using
> > > > > > > > > > > > copy time indicates the
> > > > > > > > > > > > perf.) From the trend picture, when the size is
> > > > > > > > > > > > smaller than
> > > > > > > > > > > > 128 bytes, the perf drops a lot, almost 50%. And above
> > > > > > > > > > > > 128 bytes, it approaches the original dpdk.
> > > > > > > > > > > > I computed it right now, it shows that when greater
> > > > > > > > > > > > than
> > > > > > > > > > > > 128 bytes and smaller than 1024 bytes, the perf drops
> > > > > > > > > > > > about
> > > > 15%.
> > > > > > > > > > > > When above
> > > > > > > > > > > > 1024 bytes, the perf drops about 4%.
> > > > > > > > > > > >
> > > > > > > > > > > > > From a test done at Mellanox, there might be a
> > > > > > > > > > > > > performance degradation of about 15% in testpmd
> > > > > > > > > > > > > txonly
> > > > with AVX2.
> > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > I did tests on X710, XXV710, X540 and MT27710 but didn't
> > > > > > > > > > see
> > > > > > > > > performance degradation.
> > > > > > > > > >
> > > > > > > > > > I used command "./x86_64-native-linuxapp-gcc/app/testpmd
> > > > > > > > > > -c 0xf -n
> > > > > > > > > > 4 -- -
> > > > > > > > > I" and set fwd txonly.
> > > > > > > > > > I tested it on v17.11-rc1, then revert my patch and tested it
> > > again.
> > > > > > > > > > Show port stats all and see the throughput pps. But the
> > > > > > > > > > results are similar
> > > > > > > > > and no drop.
> > > > > > > > > >
> > > > > > > > > > Did I miss something?
> > > > > > > > >
> > > > > > > > > I do not understand. Yesterday you confirmed a 15% drop with
> > > > > > > > > buffers between
> > > > > > > > > 128 and 1024 bytes.
> > > > > > > > > But you do not see this drop in your txonly tests, right?
> > > > > > > > >
> > > > > > > > Yes. The drop is using test.
> > > > > > > > Using command "make test -j" and then " ./build/app/test -c f -n 4 "
> > > > > > > > Then run "memcpy_perf_autotest"
> > > > > > > > The results are the cycles that memory copy costs.
> > > > > > > > But I just use it to show the trend because I heard that it's
> > > > > > > > not
> > > > > > > recommended to use micro benchmarks like test_memcpy_perf for
> > > > > > > memcpy performance report as they aren't likely able to reflect
> > > > > > > performance of real world applications.
> > > > > > >
> > > > > > > Yes real applications can hide the memcpy cost.
> > > > > > > Sometimes, the cost appear for real :)
> > > > > > >
> > > > > > > > Details can be seen at
> > > > > > > > https://software.intel.com/en-us/articles/performance-optimiza
> > > > > > > > ti
> > > > > > > > on-of-
> > > > > > > > memcpy-in-dpdk
> > > > > > > >
> > > > > > > > And I didn't see drop in testpmd txonly test. Maybe it's
> > > > > > > > because not a lot
> > > > > > > memcpy calls.
> > > > > > >
> > > > > > > It has been seen in a mlx4 use-case using more memcpy.
> > > > > > > I think 15% in micro-benchmark is too much.
> > > > > > > What can we do? Raise the threshold?
> > > > > > >
> > > > > > I think so. If there is big drop, can try raise the threshold. Maybe 1024?
> > > > but not sure.
> > > > > > But I didn't reproduce the 15% drop on mellanox and not sure how
> > > > > > to
> > > > verify it.
> > > > >
> > > > > I think we should focus on micro-benchmark and find a reasonable
> > > > > threshold for a reasonable drop tradeoff.
> > > > >
> > > > Sadly, it may not be that simple. What shows best performance for
> > > > micro- benchmarks may not show the same effect in a real application.
> > > >
> > > > /Bruce
> > > 
> > > Then how to measure the performance?
> > > 
> > > And I cannot reproduce 15% drop on mellanox.
> > > Could the person who tested 15% drop help to do test again with 1024
> > > threshold and see if there is any improvement?
> > 
> > As Bruce said, best performance on micro-benchmark may not show the same effect in real applications.
> 
> Yes real applications may hide the impact.
> You keep saying that it is a reason to allow degrading memcpy raw perf.
> But can you see better performance with buffers of 256 bytes with
> any application thanks to your patch?
> I am not sure whether there is a benefit in keeping code which implies
> a significant drop in micro-benchmarks.
> 
> > And I cannot reproduce the 15% drop.
> > And I don't know if raising the threshold can improve the perf or not.
> > Could the person who tested 15% drop help to do test again with 1024 threshold and see if there is any improvement?
> 
> We will test a increased threshold today.

Sorry, I forgot to update.

It seems that increasing the threshold from 128 to 1024 has no impact.
We can recover the 15% drop only by reverting the patch.

I don't know what is creating this drop exactly.
When doing different tests on different environments, we do not see this drop.
If nobody else can see such issue, I guess we can ignore it.
  
Zhihong Wang Nov. 2, 2017, 10:22 a.m. UTC | #23
> I don't know what is creating this drop exactly.
> When doing different tests on different environments, we do not see this
> drop.
> If nobody else can see such issue, I guess we can ignore it.

Hi Thomas, Xiaoyun,

With this patch (commit 84cc318424d49372dd2a5fbf3cf84426bf95acce) I see
more than 20% performance drop in vhost loopback test with testpmd
macswap for 256 bytes packets, which means it impacts actual vSwitching
performance.

Suggest we fix it or revert it for this release.

Thanks
Zhihong
  
Thomas Monjalon Nov. 2, 2017, 10:44 a.m. UTC | #24
02/11/2017 11:22, Wang, Zhihong:
> > I don't know what is creating this drop exactly.
> > When doing different tests on different environments, we do not see this
> > drop.
> > If nobody else can see such issue, I guess we can ignore it.
> 
> Hi Thomas, Xiaoyun,
> 
> With this patch (commit 84cc318424d49372dd2a5fbf3cf84426bf95acce) I see
> more than 20% performance drop in vhost loopback test with testpmd
> macswap for 256 bytes packets, which means it impacts actual vSwitching
> performance.
> 
> Suggest we fix it or revert it for this release.

I think we need more numbers to take a decision.
What is the benefit of this patch? In which use-cases?
What are the drawbacks? In which use-cases?

Please, it is a call to test performance with and without this patch
in more environments (CPU, packet size, applications).
  
Li, Xiaoyun Nov. 2, 2017, 10:58 a.m. UTC | #25
> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas@monjalon.net]
> Sent: Thursday, November 2, 2017 18:45
> To: Wang, Zhihong <zhihong.wang@intel.com>; Li, Xiaoyun
> <xiaoyun.li@intel.com>
> Cc: dev@dpdk.org; Richardson, Bruce <bruce.richardson@intel.com>;
> Ananyev, Konstantin <konstantin.ananyev@intel.com>; Lu, Wenzhuo
> <wenzhuo.lu@intel.com>; Zhang, Helin <helin.zhang@intel.com>;
> ophirmu@mellanox.com
> Subject: Re: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch over
> memcpy
> 
> 02/11/2017 11:22, Wang, Zhihong:
> > > I don't know what is creating this drop exactly.
> > > When doing different tests on different environments, we do not see
> > > this drop.
> > > If nobody else can see such issue, I guess we can ignore it.
> >
> > Hi Thomas, Xiaoyun,
> >
> > With this patch (commit 84cc318424d49372dd2a5fbf3cf84426bf95acce) I
> > see more than 20% performance drop in vhost loopback test with testpmd
> > macswap for 256 bytes packets, which means it impacts actual
> > vSwitching performance.
> >
> > Suggest we fix it or revert it for this release.
> 
> I think we need more numbers to take a decision.
> What is the benefit of this patch? In which use-cases?

 The benefit is that if you compile it on a lower platform (such as one that only supports SSE),
it can still get the ISA benefit (AVX2) when it runs on higher platforms (such as AVX2 or AVX512).
The use case seems to be that some customers want it in a cloud environment and don't want to compile on all platforms.

> What are the drawbacks? In which use-cases?

The drawback is a performance drop. So far, we see a large drop in the Mellanox case and the vhost case.

Should I send the revert patch or you revert it directly?

> 
> Please, it is a call to test performance with and without this patch in more
> environments (CPU, packet size, applications).
  
Thomas Monjalon Nov. 2, 2017, 12:15 p.m. UTC | #26
02/11/2017 11:58, Li, Xiaoyun:
> From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > 02/11/2017 11:22, Wang, Zhihong:
> > > > I don't know what is creating this drop exactly.
> > > > When doing different tests on different environments, we do not see
> > > > this drop.
> > > > If nobody else can see such issue, I guess we can ignore it.
> > >
> > > Hi Thomas, Xiaoyun,
> > >
> > > With this patch (commit 84cc318424d49372dd2a5fbf3cf84426bf95acce) I
> > > see more than 20% performance drop in vhost loopback test with testpmd
> > > macswap for 256 bytes packets, which means it impacts actual
> > > vSwitching performance.
> > >
> > > Suggest we fix it or revert it for this release.
> > 
> > I think we need more numbers to take a decision.
> > What is the benefit of this patch? In which use-cases?
> 
>  The benefit is that if you compile it on a lower platform (such as one that only supports SSE),
> it can still get the ISA benefit (AVX2) when it runs on higher platforms (such as AVX2 or AVX512).

Yes, but you don't provide any number here.

> The use case seems to be that some customers want it in a cloud environment and don't want to compile on all platforms.
> 
> > What are the drawbacks? In which use-cases?
> 
> The drawback is a performance drop. So far, we see a large drop in the Mellanox case and the vhost case.
> 
> Should I send the revert patch or you revert it directly?

You should send the revert yourself with some good justifications.
I did not ask some numbers when accepting the patch (my mistake).
Please provide the numbers for the revert.

> > Please, it is a call to test performance with and without this patch in more
> > environments (CPU, packet size, applications).

Who can test it in more environments?
  
Yao, Lei A Nov. 3, 2017, 7:47 a.m. UTC | #27
Hi, Thomas

> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Thomas Monjalon
> Sent: Thursday, November 2, 2017 6:45 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>; Li, Xiaoyun
> <xiaoyun.li@intel.com>
> Cc: dev@dpdk.org; Richardson, Bruce <bruce.richardson@intel.com>;
> Ananyev, Konstantin <konstantin.ananyev@intel.com>; Lu, Wenzhuo
> <wenzhuo.lu@intel.com>; Zhang, Helin <helin.zhang@intel.com>;
> ophirmu@mellanox.com
> Subject: Re: [dpdk-dev] [PATCH v8 1/3] eal/x86: run-time dispatch over
> memcpy
> 
> 02/11/2017 11:22, Wang, Zhihong:
> > > I don't know what is creating this drop exactly.
> > > When doing different tests on different environments, we do not see
> this
> > > drop.
> > > If nobody else can see such issue, I guess we can ignore it.
> >
> > Hi Thomas, Xiaoyun,
> >
> > With this patch (commit 84cc318424d49372dd2a5fbf3cf84426bf95acce) I see
> > more than 20% performance drop in vhost loopback test with testpmd
> > macswap for 256 bytes packets, which means it impacts actual vSwitching
> > performance.
> >
> > Suggest we fix it or revert it for this release.
> 
> I think we need more numbers to take a decision.
> What is the benefit of this patch? In which use-cases?
> What are the drawbacks? In which use-cases?
> 
> Please, it is a call to test performance with and without this patch
> in more environments (CPU, packet size, applications).

Following is the performance drop we observe in the vhost/virtio loopback test
with and without this patch:
Test application: testpmd
CPU info: Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
OS: Ubuntu 16.04

Mergeable Path	
packet size	Performance Drop
64	-1.30%
128	0.81%
158	-19.17%
188	-19.18%
218	-16.29%
230	-16.57%
256	-16.77%
280	-3.07%
300	-3.22%
380	-2.44%
420	-1.65%
512	-0.99%
1024	0.00%
1518	-0.68%
	
Vector Path	
packet size	Performance Drop
64	3.30%
128	7.18%
256	-12.77%
512	-0.98%
1024	0.27%
1518	0.68%
  

Patch

diff --git a/lib/librte_eal/bsdapp/eal/Makefile b/lib/librte_eal/bsdapp/eal/Makefile
index 317a75e..02091fb 100644
--- a/lib/librte_eal/bsdapp/eal/Makefile
+++ b/lib/librte_eal/bsdapp/eal/Makefile
@@ -93,6 +93,24 @@  SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_service.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_cpuflags.c
 SRCS-$(CONFIG_RTE_ARCH_X86) += rte_spinlock.c
 
+# for run-time dispatch of memcpy
+SRCS-$(CONFIG_RTE_ARCH_X86) += rte_memcpy.c
+SRCS-$(CONFIG_RTE_ARCH_X86) += rte_memcpy_sse.c
+
+# if the compiler supports AVX512, add avx512 file
+ifneq ($(findstring CC_SUPPORT_AVX512F,$(MACHINE_CFLAGS)),)
+SRCS-$(CONFIG_RTE_ARCH_X86) += rte_memcpy_avx512f.c
+CFLAGS_rte_memcpy_avx512f.o += -mavx512f
+CFLAGS_rte_memcpy_avx512f.o += -DRTE_MACHINE_CPUFLAG_AVX512F
+endif
+
+# if the compiler supports AVX2, add avx2 file
+ifneq ($(findstring CC_SUPPORT_AVX2,$(MACHINE_CFLAGS)),)
+SRCS-$(CONFIG_RTE_ARCH_X86) += rte_memcpy_avx2.c
+CFLAGS_rte_memcpy_avx2.o += -mavx2
+CFLAGS_rte_memcpy_avx2.o += -DRTE_MACHINE_CPUFLAG_AVX2
+endif
+
 CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST)
 
 CFLAGS_eal.o := -D_GNU_SOURCE
diff --git a/lib/librte_eal/bsdapp/eal/rte_eal_version.map b/lib/librte_eal/bsdapp/eal/rte_eal_version.map
index d19f264..080896f 100644
--- a/lib/librte_eal/bsdapp/eal/rte_eal_version.map
+++ b/lib/librte_eal/bsdapp/eal/rte_eal_version.map
@@ -243,6 +243,7 @@  DPDK_17.11 {
 	rte_eal_iova_mode;
 	rte_eal_mbuf_default_mempool_ops;
 	rte_lcore_has_role;
+	rte_memcpy_ptr;
 	rte_pci_get_iommu_class;
 	rte_pci_match;
 
diff --git a/lib/librte_eal/common/arch/x86/rte_memcpy.c b/lib/librte_eal/common/arch/x86/rte_memcpy.c
new file mode 100644
index 0000000..74ae702
--- /dev/null
+++ b/lib/librte_eal/common/arch/x86/rte_memcpy.c
@@ -0,0 +1,59 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <rte_memcpy.h>
+#include <rte_cpuflags.h>
+#include <rte_log.h>
+
+void *(*rte_memcpy_ptr)(void *dst, const void *src, size_t n) = NULL;
+
+static void __attribute__((constructor))
+rte_memcpy_init(void)
+{
+#ifdef CC_SUPPORT_AVX512F
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F)) {
+		rte_memcpy_ptr = rte_memcpy_avx512f;
+		RTE_LOG(DEBUG, EAL, "AVX512 memcpy is using!\n");
+		return;
+	}
+#endif
+#ifdef CC_SUPPORT_AVX2
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) {
+		rte_memcpy_ptr = rte_memcpy_avx2;
+		RTE_LOG(DEBUG, EAL, "AVX2 memcpy is using!\n");
+		return;
+	}
+#endif
+	rte_memcpy_ptr = rte_memcpy_sse;
+	RTE_LOG(DEBUG, EAL, "Default SSE/AVX memcpy is using!\n");
+}
diff --git a/lib/librte_eal/common/arch/x86/rte_memcpy_avx2.c b/lib/librte_eal/common/arch/x86/rte_memcpy_avx2.c
new file mode 100644
index 0000000..3ad229c
--- /dev/null
+++ b/lib/librte_eal/common/arch/x86/rte_memcpy_avx2.c
@@ -0,0 +1,44 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <rte_memcpy.h>
+
+#ifndef RTE_MACHINE_CPUFLAG_AVX2
+#error RTE_MACHINE_CPUFLAG_AVX2 not defined
+#endif
+
+void *
+rte_memcpy_avx2(void *dst, const void *src, size_t n)
+{
+	return rte_memcpy_internal(dst, src, n);
+}
diff --git a/lib/librte_eal/common/arch/x86/rte_memcpy_avx512f.c b/lib/librte_eal/common/arch/x86/rte_memcpy_avx512f.c
new file mode 100644
index 0000000..be8d964
--- /dev/null
+++ b/lib/librte_eal/common/arch/x86/rte_memcpy_avx512f.c
@@ -0,0 +1,44 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <rte_memcpy.h>
+
+#ifndef RTE_MACHINE_CPUFLAG_AVX512F
+#error RTE_MACHINE_CPUFLAG_AVX512F not defined
+#endif
+
+void *
+rte_memcpy_avx512f(void *dst, const void *src, size_t n)
+{
+	return rte_memcpy_internal(dst, src, n);
+}
diff --git a/lib/librte_eal/common/arch/x86/rte_memcpy_sse.c b/lib/librte_eal/common/arch/x86/rte_memcpy_sse.c
new file mode 100644
index 0000000..55d6b41
--- /dev/null
+++ b/lib/librte_eal/common/arch/x86/rte_memcpy_sse.c
@@ -0,0 +1,40 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <rte_memcpy.h>
+
+void *
+rte_memcpy_sse(void *dst, const void *src, size_t n)
+{
+	return rte_memcpy_internal(dst, src, n);
+}
diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
index 74c280c..460dcdb 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
@@ -1,7 +1,7 @@ 
 /*-
  *   BSD LICENSE
  *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
  *   All rights reserved.
  *
  *   Redistribution and use in source and binary forms, with or without
@@ -34,867 +34,36 @@ 
 #ifndef _RTE_MEMCPY_X86_64_H_
 #define _RTE_MEMCPY_X86_64_H_
 
-/**
- * @file
- *
- * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
- */
-
-#include <stdio.h>
-#include <stdint.h>
-#include <string.h>
-#include <rte_vect.h>
-#include <rte_common.h>
+#include <rte_memcpy_internal.h>
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-/**
- * Copy bytes from one location to another. The locations must not overlap.
- *
- * @note This is implemented as a macro, so it's address should not be taken
- * and care is needed as parameter expressions may be evaluated multiple times.
- *
- * @param dst
- *   Pointer to the destination of the data.
- * @param src
- *   Pointer to the source data.
- * @param n
- *   Number of bytes to copy.
- * @return
- *   Pointer to the destination data.
- */
-static __rte_always_inline void *
-rte_memcpy(void *dst, const void *src, size_t n);
-
-#ifdef RTE_MACHINE_CPUFLAG_AVX512F
+#define RTE_X86_MEMCPY_THRESH 128
 
-#define ALIGNMENT_MASK 0x3F
+extern void *
+(*rte_memcpy_ptr)(void *dst, const void *src, size_t n);
 
 /**
- * AVX512 implementation below
+ * Different implementations of memcpy.
  */
+extern void*
+rte_memcpy_avx512f(void *dst, const void *src, size_t n);
 
-/**
- * Copy 16 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm0;
-
-	xmm0 = _mm_loadu_si128((const __m128i *)src);
-	_mm_storeu_si128((__m128i *)dst, xmm0);
-}
-
-/**
- * Copy 32 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	__m256i ymm0;
+extern void *
+rte_memcpy_avx2(void *dst, const void *src, size_t n);
 
-	ymm0 = _mm256_loadu_si256((const __m256i *)src);
-	_mm256_storeu_si256((__m256i *)dst, ymm0);
-}
-
-/**
- * Copy 64 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	__m512i zmm0;
-
-	zmm0 = _mm512_loadu_si512((const void *)src);
-	_mm512_storeu_si512((void *)dst, zmm0);
-}
-
-/**
- * Copy 128 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov64(dst + 0 * 64, src + 0 * 64);
-	rte_mov64(dst + 1 * 64, src + 1 * 64);
-}
-
-/**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov64(dst + 0 * 64, src + 0 * 64);
-	rte_mov64(dst + 1 * 64, src + 1 * 64);
-	rte_mov64(dst + 2 * 64, src + 2 * 64);
-	rte_mov64(dst + 3 * 64, src + 3 * 64);
-}
-
-/**
- * Copy 128-byte blocks from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
-{
-	__m512i zmm0, zmm1;
-
-	while (n >= 128) {
-		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
-		n -= 128;
-		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
-		src = src + 128;
-		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
-		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
-		dst = dst + 128;
-	}
-}
-
-/**
- * Copy 512-byte blocks from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
-{
-	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
-
-	while (n >= 512) {
-		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
-		n -= 512;
-		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
-		zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
-		zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
-		zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
-		zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
-		zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
-		zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
-		src = src + 512;
-		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
-		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
-		_mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
-		_mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
-		_mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
-		_mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
-		_mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
-		_mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
-		dst = dst + 512;
-	}
-}
-
-static inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
-{
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
-	void *ret = dst;
-	size_t dstofss;
-	size_t bits;
-
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08)
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		return ret;
-	}
-
-	/**
-	 * Fast way when copy size doesn't exceed 512 bytes
-	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n,
-				  (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				  (const uint8_t *)src - 32 + n);
-		return ret;
-	}
-	if (n <= 512) {
-		if (n >= 256) {
-			n -= 256;
-			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
-			src = (const uint8_t *)src + 256;
-			dst = (uint8_t *)dst + 256;
-		}
-		if (n >= 128) {
-			n -= 128;
-			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
-			src = (const uint8_t *)src + 128;
-			dst = (uint8_t *)dst + 128;
-		}
-COPY_BLOCK_128_BACK63:
-		if (n > 64) {
-			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-			rte_mov64((uint8_t *)dst - 64 + n,
-					  (const uint8_t *)src - 64 + n);
-			return ret;
-		}
-		if (n > 0)
-			rte_mov64((uint8_t *)dst - 64 + n,
-					  (const uint8_t *)src - 64 + n);
-		return ret;
-	}
-
-	/**
-	 * Make store aligned when copy size exceeds 512 bytes
-	 */
-	dstofss = ((uintptr_t)dst & 0x3F);
-	if (dstofss > 0) {
-		dstofss = 64 - dstofss;
-		n -= dstofss;
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		src = (const uint8_t *)src + dstofss;
-		dst = (uint8_t *)dst + dstofss;
-	}
-
-	/**
-	 * Copy 512-byte blocks.
-	 * Use copy block function for better instruction order control,
-	 * which is important when load is unaligned.
-	 */
-	rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
-	bits = n;
-	n = n & 511;
-	bits -= n;
-	src = (const uint8_t *)src + bits;
-	dst = (uint8_t *)dst + bits;
-
-	/**
-	 * Copy 128-byte blocks.
-	 * Use copy block function for better instruction order control,
-	 * which is important when load is unaligned.
-	 */
-	if (n >= 128) {
-		rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
-		bits = n;
-		n = n & 127;
-		bits -= n;
-		src = (const uint8_t *)src + bits;
-		dst = (uint8_t *)dst + bits;
-	}
-
-	/**
-	 * Copy whatever left
-	 */
-	goto COPY_BLOCK_128_BACK63;
-}
-
-#elif defined RTE_MACHINE_CPUFLAG_AVX2
-
-#define ALIGNMENT_MASK 0x1F
-
-/**
- * AVX2 implementation below
- */
-
-/**
- * Copy 16 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm0;
-
-	xmm0 = _mm_loadu_si128((const __m128i *)src);
-	_mm_storeu_si128((__m128i *)dst, xmm0);
-}
-
-/**
- * Copy 32 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	__m256i ymm0;
-
-	ymm0 = _mm256_loadu_si256((const __m256i *)src);
-	_mm256_storeu_si256((__m256i *)dst, ymm0);
-}
-
-/**
- * Copy 64 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-}
-
-/**
- * Copy 128 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-}
-
-/**
- * Copy 128-byte blocks from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
-{
-	__m256i ymm0, ymm1, ymm2, ymm3;
-
-	while (n >= 128) {
-		ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
-		n -= 128;
-		ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
-		ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32));
-		ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32));
-		src = (const uint8_t *)src + 128;
-		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
-		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
-		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
-		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
-		dst = (uint8_t *)dst + 128;
-	}
-}
-
-static inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
-{
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
-	void *ret = dst;
-	size_t dstofss;
-	size_t bits;
-
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08) {
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		}
-		return ret;
-	}
-
-	/**
-	 * Fast way when copy size doesn't exceed 256 bytes
-	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 48) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-		return ret;
-	}
-	if (n <= 256) {
-		if (n >= 128) {
-			n -= 128;
-			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
-			src = (const uint8_t *)src + 128;
-			dst = (uint8_t *)dst + 128;
-		}
-COPY_BLOCK_128_BACK31:
-		if (n >= 64) {
-			n -= 64;
-			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-			src = (const uint8_t *)src + 64;
-			dst = (uint8_t *)dst + 64;
-		}
-		if (n > 32) {
-			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-			rte_mov32((uint8_t *)dst - 32 + n,
-					(const uint8_t *)src - 32 + n);
-			return ret;
-		}
-		if (n > 0) {
-			rte_mov32((uint8_t *)dst - 32 + n,
-					(const uint8_t *)src - 32 + n);
-		}
-		return ret;
-	}
-
-	/**
-	 * Make store aligned when copy size exceeds 256 bytes
-	 */
-	dstofss = (uintptr_t)dst & 0x1F;
-	if (dstofss > 0) {
-		dstofss = 32 - dstofss;
-		n -= dstofss;
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		src = (const uint8_t *)src + dstofss;
-		dst = (uint8_t *)dst + dstofss;
-	}
-
-	/**
-	 * Copy 128-byte blocks
-	 */
-	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
-	bits = n;
-	n = n & 127;
-	bits -= n;
-	src = (const uint8_t *)src + bits;
-	dst = (uint8_t *)dst + bits;
-
-	/**
-	 * Copy whatever left
-	 */
-	goto COPY_BLOCK_128_BACK31;
-}
-
-#else /* RTE_MACHINE_CPUFLAG */
-
-#define ALIGNMENT_MASK 0x0F
-
-/**
- * SSE & AVX implementation below
- */
-
-/**
- * Copy 16 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm0;
-
-	xmm0 = _mm_loadu_si128((const __m128i *)(const __m128i *)src);
-	_mm_storeu_si128((__m128i *)dst, xmm0);
-}
-
-/**
- * Copy 32 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-}
-
-/**
- * Copy 64 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-}
-
-/**
- * Copy 128 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-}
-
-/**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
-	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
-	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
-	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
-	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
-	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
-	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
-	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
-}
-
-/**
- * Macro for copying unaligned block from one location to another with constant load offset,
- * 47 bytes leftover maximum,
- * locations should not overlap.
- * Requirements:
- * - Store is aligned
- * - Load offset is <offset>, which must be immediate value within [1, 15]
- * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading
- * - <dst>, <src>, <len> must be variables
- * - __m128i <xmm0> ~ <xmm8> must be pre-defined
- */
-#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset)                                                     \
-__extension__ ({                                                                                            \
-    int tmp;                                                                                                \
-    while (len >= 128 + 16 - offset) {                                                                      \
-        xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16));                  \
-        len -= 128;                                                                                         \
-        xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16));                  \
-        xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16));                  \
-        xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 3 * 16));                  \
-        xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 4 * 16));                  \
-        xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 5 * 16));                  \
-        xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 6 * 16));                  \
-        xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 7 * 16));                  \
-        xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 8 * 16));                  \
-        src = (const uint8_t *)src + 128;                                                                   \
-        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
-        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
-        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
-        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
-        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
-        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
-        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
-        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
-        dst = (uint8_t *)dst + 128;                                                                         \
-    }                                                                                                       \
-    tmp = len;                                                                                              \
-    len = ((len - 16 + offset) & 127) + 16 - offset;                                                        \
-    tmp -= len;                                                                                             \
-    src = (const uint8_t *)src + tmp;                                                                       \
-    dst = (uint8_t *)dst + tmp;                                                                             \
-    if (len >= 32 + 16 - offset) {                                                                          \
-        while (len >= 32 + 16 - offset) {                                                                   \
-            xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16));              \
-            len -= 32;                                                                                      \
-            xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16));              \
-            xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16));              \
-            src = (const uint8_t *)src + 32;                                                                \
-            _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
-            _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
-            dst = (uint8_t *)dst + 32;                                                                      \
-        }                                                                                                   \
-        tmp = len;                                                                                          \
-        len = ((len - 16 + offset) & 31) + 16 - offset;                                                     \
-        tmp -= len;                                                                                         \
-        src = (const uint8_t *)src + tmp;                                                                   \
-        dst = (uint8_t *)dst + tmp;                                                                         \
-    }                                                                                                       \
-})
-
-/**
- * Macro for copying unaligned block from one location to another,
- * 47 bytes leftover maximum,
- * locations should not overlap.
- * Use switch here because the aligning instruction requires immediate value for shift count.
- * Requirements:
- * - Store is aligned
- * - Load offset is <offset>, which must be within [1, 15]
- * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading
- * - <dst>, <src>, <len> must be variables
- * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
- */
-#define MOVEUNALIGNED_LEFT47(dst, src, len, offset)                   \
-__extension__ ({                                                      \
-    switch (offset) {                                                 \
-    case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
-    case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
-    case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
-    case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
-    case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
-    case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
-    case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
-    case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
-    case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
-    case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
-    case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
-    case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
-    case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
-    case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
-    case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
-    default:;                                                         \
-    }                                                                 \
-})
-
-static inline void *
-rte_memcpy_generic(void *dst, const void *src, size_t n)
-{
-	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
-	void *ret = dst;
-	size_t dstofss;
-	size_t srcofs;
-
-	/**
-	 * Copy less than 16 bytes
-	 */
-	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08) {
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		}
-		return ret;
-	}
-
-	/**
-	 * Fast way when copy size doesn't exceed 512 bytes
-	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 48) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 128) {
-		goto COPY_BLOCK_128_BACK15;
-	}
-	if (n <= 512) {
-		if (n >= 256) {
-			n -= 256;
-			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
-			rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
-			src = (const uint8_t *)src + 256;
-			dst = (uint8_t *)dst + 256;
-		}
-COPY_BLOCK_255_BACK15:
-		if (n >= 128) {
-			n -= 128;
-			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
-			src = (const uint8_t *)src + 128;
-			dst = (uint8_t *)dst + 128;
-		}
-COPY_BLOCK_128_BACK15:
-		if (n >= 64) {
-			n -= 64;
-			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-			src = (const uint8_t *)src + 64;
-			dst = (uint8_t *)dst + 64;
-		}
-COPY_BLOCK_64_BACK15:
-		if (n >= 32) {
-			n -= 32;
-			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-			src = (const uint8_t *)src + 32;
-			dst = (uint8_t *)dst + 32;
-		}
-		if (n > 16) {
-			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-			return ret;
-		}
-		if (n > 0) {
-			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		}
-		return ret;
-	}
-
-	/**
-	 * Make store aligned when copy size exceeds 512 bytes,
-	 * and make sure the first 15 bytes are copied, because
-	 * unaligned copy functions require up to 15 bytes
-	 * backwards access.
-	 */
-	dstofss = (uintptr_t)dst & 0x0F;
-	if (dstofss > 0) {
-		dstofss = 16 - dstofss + 16;
-		n -= dstofss;
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		src = (const uint8_t *)src + dstofss;
-		dst = (uint8_t *)dst + dstofss;
-	}
-	srcofs = ((uintptr_t)src & 0x0F);
-
-	/**
-	 * For aligned copy
-	 */
-	if (srcofs == 0) {
-		/**
-		 * Copy 256-byte blocks
-		 */
-		for (; n >= 256; n -= 256) {
-			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
-			dst = (uint8_t *)dst + 256;
-			src = (const uint8_t *)src + 256;
-		}
-
-		/**
-		 * Copy whatever left
-		 */
-		goto COPY_BLOCK_255_BACK15;
-	}
-
-	/**
-	 * For copy with unaligned load
-	 */
-	MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
-
-	/**
-	 * Copy whatever left
-	 */
-	goto COPY_BLOCK_64_BACK15;
-}
-
-#endif /* RTE_MACHINE_CPUFLAG */
-
-static inline void *
-rte_memcpy_aligned(void *dst, const void *src, size_t n)
-{
-	void *ret = dst;
-
-	/* Copy size <= 16 bytes */
-	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dst = *(const uint8_t *)src;
-			src = (const uint8_t *)src + 1;
-			dst = (uint8_t *)dst + 1;
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dst = *(const uint16_t *)src;
-			src = (const uint16_t *)src + 1;
-			dst = (uint16_t *)dst + 1;
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dst = *(const uint32_t *)src;
-			src = (const uint32_t *)src + 1;
-			dst = (uint32_t *)dst + 1;
-		}
-		if (n & 0x08)
-			*(uint64_t *)dst = *(const uint64_t *)src;
-
-		return ret;
-	}
-
-	/* Copy 16 <= size <= 32 bytes */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
-
-		return ret;
-	}
-
-	/* Copy 32 < size <= 64 bytes */
-	if (n <= 64) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov32((uint8_t *)dst - 32 + n,
-				(const uint8_t *)src - 32 + n);
-
-		return ret;
-	}
-
-	/* Copy 64 bytes blocks */
-	for (; n >= 64; n -= 64) {
-		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
-		dst = (uint8_t *)dst + 64;
-		src = (const uint8_t *)src + 64;
-	}
-
-	/* Copy whatever left */
-	rte_mov64((uint8_t *)dst - 64 + n,
-			(const uint8_t *)src - 64 + n);
-
-	return ret;
-}
+extern void *
+rte_memcpy_sse(void *dst, const void *src, size_t n);
 
 static inline void *
 rte_memcpy(void *dst, const void *src, size_t n)
 {
-	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
-		return rte_memcpy_aligned(dst, src, n);
+	if (n <= RTE_X86_MEMCPY_THRESH)
+		return rte_memcpy_internal(dst, src, n);
 	else
-		return rte_memcpy_generic(dst, src, n);
+		return (*rte_memcpy_ptr)(dst, src, n);
 }
 
 #ifdef __cplusplus
diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy_internal.h b/lib/librte_eal/common/include/arch/x86/rte_memcpy_internal.h
new file mode 100644
index 0000000..63ba628
--- /dev/null
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy_internal.h
@@ -0,0 +1,966 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_MEMCPY_INTERNAL_X86_64_H_
+#define _RTE_MEMCPY_INTERNAL_X86_64_H_
+
+/**
+ * @file
+ *
+ * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <rte_vect.h>
+#include <rte_common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Copy bytes from one location to another. The locations must not overlap.
+ *
+ * @note This is implemented as a macro, so its address should not be taken
+ * and care is needed as parameter expressions may be evaluated multiple times.
+ *
+ * @param dst
+ *   Pointer to the destination of the data.
+ * @param src
+ *   Pointer to the source data.
+ * @param n
+ *   Number of bytes to copy.
+ * @return
+ *   Pointer to the destination data.
+ */
+
+#ifdef RTE_MACHINE_CPUFLAG_AVX512F
+
+#define ALIGNMENT_MASK 0x3F
+
+/**
+ * AVX512 implementation below
+ */
+
+/**
+ * Copy 16 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+	__m128i xmm0;
+
+	xmm0 = _mm_loadu_si128((const __m128i *)src);
+	_mm_storeu_si128((__m128i *)dst, xmm0);
+}
+
+/**
+ * Copy 32 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	__m256i ymm0;
+
+	ymm0 = _mm256_loadu_si256((const __m256i *)src);
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
+}
+
+/**
+ * Copy 64 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	__m512i zmm0;
+
+	zmm0 = _mm512_loadu_si512((const void *)src);
+	_mm512_storeu_si512((void *)dst, zmm0);
+}
+
+/**
+ * Copy 128 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov64(dst + 0 * 64, src + 0 * 64);
+	rte_mov64(dst + 1 * 64, src + 1 * 64);
+}
+
+/**
+ * Copy 256 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov64(dst + 0 * 64, src + 0 * 64);
+	rte_mov64(dst + 1 * 64, src + 1 * 64);
+	rte_mov64(dst + 2 * 64, src + 2 * 64);
+	rte_mov64(dst + 3 * 64, src + 3 * 64);
+}
+
+/**
+ * Copy 128-byte blocks from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	__m512i zmm0, zmm1;
+
+	while (n >= 128) {
+		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
+		n -= 128;
+		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
+		src = src + 128;
+		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
+		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
+		dst = dst + 128;
+	}
+}
+
+/**
+ * Copy 512-byte blocks from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
+
+	while (n >= 512) {
+		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
+		n -= 512;
+		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
+		zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
+		zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
+		zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
+		zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
+		zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
+		zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
+		src = src + 512;
+		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
+		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
+		_mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
+		_mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
+		_mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
+		_mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
+		_mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
+		_mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
+		dst = dst + 512;
+	}
+}
+
+static inline void *
+rte_memcpy_generic(void *dst, const void *src, size_t n)
+{
+	uintptr_t dstu = (uintptr_t)dst;
+	uintptr_t srcu = (uintptr_t)src;
+	void *ret = dst;
+	size_t dstofss;
+	size_t bits;
+
+	/**
+	 * Copy less than 16 bytes
+	 */
+	if (n < 16) {
+		if (n & 0x01) {
+			*(uint8_t *)dstu = *(const uint8_t *)srcu;
+			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
+			dstu = (uintptr_t)((uint8_t *)dstu + 1);
+		}
+		if (n & 0x02) {
+			*(uint16_t *)dstu = *(const uint16_t *)srcu;
+			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
+			dstu = (uintptr_t)((uint16_t *)dstu + 1);
+		}
+		if (n & 0x04) {
+			*(uint32_t *)dstu = *(const uint32_t *)srcu;
+			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
+			dstu = (uintptr_t)((uint32_t *)dstu + 1);
+		}
+		if (n & 0x08)
+			*(uint64_t *)dstu = *(const uint64_t *)srcu;
+		return ret;
+	}
+
+	/**
+	 * Fast way when copy size doesn't exceed 512 bytes
+	 */
+	if (n <= 32) {
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov16((uint8_t *)dst - 16 + n,
+				  (const uint8_t *)src - 16 + n);
+		return ret;
+	}
+	if (n <= 64) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov32((uint8_t *)dst - 32 + n,
+				  (const uint8_t *)src - 32 + n);
+		return ret;
+	}
+	if (n <= 512) {
+		if (n >= 256) {
+			n -= 256;
+			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
+			src = (const uint8_t *)src + 256;
+			dst = (uint8_t *)dst + 256;
+		}
+		if (n >= 128) {
+			n -= 128;
+			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
+			src = (const uint8_t *)src + 128;
+			dst = (uint8_t *)dst + 128;
+		}
+COPY_BLOCK_128_BACK63:
+		if (n > 64) {
+			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov64((uint8_t *)dst - 64 + n,
+					  (const uint8_t *)src - 64 + n);
+			return ret;
+		}
+		if (n > 0)
+			rte_mov64((uint8_t *)dst - 64 + n,
+					  (const uint8_t *)src - 64 + n);
+		return ret;
+	}
+
+	/**
+	 * Make store aligned when copy size exceeds 512 bytes
+	 */
+	dstofss = ((uintptr_t)dst & 0x3F);
+	if (dstofss > 0) {
+		dstofss = 64 - dstofss;
+		n -= dstofss;
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		src = (const uint8_t *)src + dstofss;
+		dst = (uint8_t *)dst + dstofss;
+	}
+
+	/**
+	 * Copy 512-byte blocks.
+	 * Use copy block function for better instruction order control,
+	 * which is important when load is unaligned.
+	 */
+	rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
+	bits = n;
+	n = n & 511;
+	bits -= n;
+	src = (const uint8_t *)src + bits;
+	dst = (uint8_t *)dst + bits;
+
+	/**
+	 * Copy 128-byte blocks.
+	 * Use copy block function for better instruction order control,
+	 * which is important when load is unaligned.
+	 */
+	if (n >= 128) {
+		rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
+		bits = n;
+		n = n & 127;
+		bits -= n;
+		src = (const uint8_t *)src + bits;
+		dst = (uint8_t *)dst + bits;
+	}
+
+	/**
+	 * Copy whatever left
+	 */
+	goto COPY_BLOCK_128_BACK63;
+}
+
+#elif defined RTE_MACHINE_CPUFLAG_AVX2
+
+#define ALIGNMENT_MASK 0x1F
+
+/**
+ * AVX2 implementation below
+ */
+
+/**
+ * Copy 16 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+	__m128i xmm0;
+
+	xmm0 = _mm_loadu_si128((const __m128i *)src);
+	_mm_storeu_si128((__m128i *)dst, xmm0);
+}
+
+/**
+ * Copy 32 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	__m256i ymm0;
+
+	ymm0 = _mm256_loadu_si256((const __m256i *)src);
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
+}
+
+/**
+ * Copy 64 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
+	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
+}
+
+/**
+ * Copy 128 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
+	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
+	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
+	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
+}
+
+/**
+ * Copy 128-byte blocks from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	__m256i ymm0, ymm1, ymm2, ymm3;
+
+	while (n >= 128) {
+		ymm0 = _mm256_loadu_si256((const __m256i *)
+				((const uint8_t *)src + 0 * 32));
+		n -= 128;
+		ymm1 = _mm256_loadu_si256((const __m256i *)
+				((const uint8_t *)src + 1 * 32));
+		ymm2 = _mm256_loadu_si256((const __m256i *)
+				((const uint8_t *)src + 2 * 32));
+		ymm3 = _mm256_loadu_si256((const __m256i *)
+				((const uint8_t *)src + 3 * 32));
+		src = (const uint8_t *)src + 128;
+		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
+		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
+		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
+		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
+		dst = (uint8_t *)dst + 128;
+	}
+}
+
+static inline void *
+rte_memcpy_generic(void *dst, const void *src, size_t n)
+{
+	uintptr_t dstu = (uintptr_t)dst;
+	uintptr_t srcu = (uintptr_t)src;
+	void *ret = dst;
+	size_t dstofss;
+	size_t bits;
+
+	/**
+	 * Copy less than 16 bytes
+	 */
+	if (n < 16) {
+		if (n & 0x01) {
+			*(uint8_t *)dstu = *(const uint8_t *)srcu;
+			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
+			dstu = (uintptr_t)((uint8_t *)dstu + 1);
+		}
+		if (n & 0x02) {
+			*(uint16_t *)dstu = *(const uint16_t *)srcu;
+			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
+			dstu = (uintptr_t)((uint16_t *)dstu + 1);
+		}
+		if (n & 0x04) {
+			*(uint32_t *)dstu = *(const uint32_t *)srcu;
+			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
+			dstu = (uintptr_t)((uint32_t *)dstu + 1);
+		}
+		if (n & 0x08)
+			*(uint64_t *)dstu = *(const uint64_t *)srcu;
+		return ret;
+	}
+
+	/**
+	 * Fast way when copy size doesn't exceed 256 bytes
+	 */
+	if (n <= 32) {
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov16((uint8_t *)dst - 16 + n,
+				(const uint8_t *)src - 16 + n);
+		return ret;
+	}
+	if (n <= 48) {
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+		rte_mov16((uint8_t *)dst - 16 + n,
+				(const uint8_t *)src - 16 + n);
+		return ret;
+	}
+	if (n <= 64) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov32((uint8_t *)dst - 32 + n,
+				(const uint8_t *)src - 32 + n);
+		return ret;
+	}
+	if (n <= 256) {
+		if (n >= 128) {
+			n -= 128;
+			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
+			src = (const uint8_t *)src + 128;
+			dst = (uint8_t *)dst + 128;
+		}
+COPY_BLOCK_128_BACK31:
+		if (n >= 64) {
+			n -= 64;
+			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+			src = (const uint8_t *)src + 64;
+			dst = (uint8_t *)dst + 64;
+		}
+		if (n > 32) {
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
+			return ret;
+		}
+		if (n > 0) {
+			rte_mov32((uint8_t *)dst - 32 + n,
+					(const uint8_t *)src - 32 + n);
+		}
+		return ret;
+	}
+
+	/**
+	 * Make store aligned when copy size exceeds 256 bytes
+	 */
+	dstofss = (uintptr_t)dst & 0x1F;
+	if (dstofss > 0) {
+		dstofss = 32 - dstofss;
+		n -= dstofss;
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		src = (const uint8_t *)src + dstofss;
+		dst = (uint8_t *)dst + dstofss;
+	}
+
+	/**
+	 * Copy 128-byte blocks
+	 */
+	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
+	bits = n;
+	n = n & 127;
+	bits -= n;
+	src = (const uint8_t *)src + bits;
+	dst = (uint8_t *)dst + bits;
+
+	/**
+	 * Copy whatever left
+	 */
+	goto COPY_BLOCK_128_BACK31;
+}
+
+#else /* RTE_MACHINE_CPUFLAG */
+
+#define ALIGNMENT_MASK 0x0F
+
+/**
+ * SSE & AVX implementation below
+ */
+
+/**
+ * Copy 16 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+	__m128i xmm0;
+
+	xmm0 = _mm_loadu_si128((const __m128i *)src);
+	_mm_storeu_si128((__m128i *)dst, xmm0);
+}
+
+/**
+ * Copy 32 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+}
+
+/**
+ * Copy 64 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
+}
+
+/**
+ * Copy 128 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
+	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
+	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
+	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
+	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
+}
+
+/**
+ * Copy 256 bytes from one location to another,
+ * locations should not overlap.
+ */
+static inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
+	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
+	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
+	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
+	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
+	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
+	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
+	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
+	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
+	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
+	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
+	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
+	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
+}
+
+/**
+ * Macro for copying unaligned block from one location to another with constant
+ * load offset, 47 bytes leftover maximum,
+ * locations should not overlap.
+ * Loads start at (src - offset); each pair of adjacent 16-byte loads is
+ * recombined into an aligned store with _mm_alignr_epi8 (PALIGNR).
+ * Requirements:
+ * - Store is aligned
+ * - Load offset is <offset>, which must be an immediate value within [1, 15]
+ * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes
+ *   forwards are available for loading
+ * - <dst>, <src>, <len> must be variables
+ * - __m128i <xmm0> ~ <xmm8> must be pre-defined
+ */
+#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset)(		      \
+__extension__ ({							      \
+	int tmp;							      \
+	while (len >= 128 + 16 - offset) {				      \
+		xmm0 = _mm_loadu_si128((const __m128i *)		      \
+			((const uint8_t *)src - offset + 0 * 16));	      \
+		len -= 128;						      \
+		xmm1 = _mm_loadu_si128((const __m128i *)		      \
+			((const uint8_t *)src - offset + 1 * 16));	      \
+		xmm2 = _mm_loadu_si128((const __m128i *)		      \
+			((const uint8_t *)src - offset + 2 * 16));	      \
+		xmm3 = _mm_loadu_si128((const __m128i *)		      \
+			((const uint8_t *)src - offset + 3 * 16));	      \
+		xmm4 = _mm_loadu_si128((const __m128i *)		      \
+			((const uint8_t *)src - offset + 4 * 16));	      \
+		xmm5 = _mm_loadu_si128((const __m128i *)		      \
+			((const uint8_t *)src - offset + 5 * 16));	      \
+		xmm6 = _mm_loadu_si128((const __m128i *)		      \
+			((const uint8_t *)src - offset + 6 * 16));	      \
+		xmm7 = _mm_loadu_si128((const __m128i *)		      \
+			((const uint8_t *)src - offset + 7 * 16));	      \
+		xmm8 = _mm_loadu_si128((const __m128i *)		      \
+			((const uint8_t *)src - offset + 8 * 16));	      \
+		src = (const uint8_t *)src + 128;			      \
+		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16),        \
+			_mm_alignr_epi8(xmm1, xmm0, offset));		      \
+		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16),        \
+			_mm_alignr_epi8(xmm2, xmm1, offset));		      \
+		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16),        \
+			_mm_alignr_epi8(xmm3, xmm2, offset));		      \
+		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 3 * 16),        \
+			_mm_alignr_epi8(xmm4, xmm3, offset));		      \
+		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16),        \
+			_mm_alignr_epi8(xmm5, xmm4, offset));		      \
+		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16),        \
+			_mm_alignr_epi8(xmm6, xmm5, offset));		      \
+		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 6 * 16),        \
+			_mm_alignr_epi8(xmm7, xmm6, offset));		      \
+		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16),        \
+			_mm_alignr_epi8(xmm8, xmm7, offset));		      \
+		dst = (uint8_t *)dst + 128;				      \
+	}								      \
+	tmp = len;							      \
+	len = ((len - 16 + offset) & 127) + 16 - offset;		      \
+	tmp -= len;							      \
+	src = (const uint8_t *)src + tmp;				      \
+	dst = (uint8_t *)dst + tmp;					      \
+	if (len >= 32 + 16 - offset) {					      \
+		while (len >= 32 + 16 - offset) {			      \
+			xmm0 = _mm_loadu_si128((const __m128i *)	      \
+				((const uint8_t *)src - offset + 0 * 16));    \
+			len -= 32;					      \
+			xmm1 = _mm_loadu_si128((const __m128i *)	      \
+				((const uint8_t *)src - offset + 1 * 16));    \
+			xmm2 = _mm_loadu_si128((const __m128i *)	      \
+				((const uint8_t *)src - offset + 2 * 16));    \
+			src = (const uint8_t *)src + 32;		      \
+			_mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16),\
+				_mm_alignr_epi8(xmm1, xmm0, offset));	      \
+			_mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16),\
+				_mm_alignr_epi8(xmm2, xmm1, offset));	      \
+			dst = (uint8_t *)dst + 32;			      \
+		}							      \
+		tmp = len;						      \
+		len = ((len - 16 + offset) & 31) + 16 - offset;		      \
+		tmp -= len;						      \
+		src = (const uint8_t *)src + tmp;			      \
+		dst = (uint8_t *)dst + tmp;				      \
+	}								      \
+}))
+
+/**
+ * Macro for copying unaligned block from one location to another,
+ * 47 bytes leftover maximum,
+ * locations should not overlap.
+ * Use switch here because the aligning instruction requires immediate value
+ * for shift count.
+ * Requirements:
+ * - Store is aligned
+ * - Load offset is <offset>, which must be within [1, 15]
+ * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes
+ *   forwards are available for loading
+ * - <dst>, <src>, <len> must be variables
+ * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be
+ *   pre-defined
+ */
+#define MOVEUNALIGNED_LEFT47(dst, src, len, offset)(			    \
+__extension__ ({							    \
+	switch (offset) {						    \
+	case 0x01:							    \
+		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x01);		    \
+		break;							    \
+	case 0x02:							    \
+		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x02);		    \
+		break;							    \
+	case 0x03:							    \
+		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x03);		    \
+		break;							    \
+	case 0x04:							    \
+		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x04);		    \
+		break;							    \
+	case 0x05:							    \
+		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x05);		    \
+		break;							    \
+	case 0x06:							    \
+		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x06);		    \
+		break;							    \
+	case 0x07:							    \
+		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x07);		    \
+		break;							    \
+	case 0x08:							    \
+		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x08);		    \
+		break;							    \
+	case 0x09:							    \
+		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x09);		    \
+		break;							    \
+	case 0x0A:							    \
+		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0A);		    \
+		break;							    \
+	case 0x0B:							    \
+		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0B);		    \
+		break;							    \
+	case 0x0C:							    \
+		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0C);		    \
+		break;							    \
+	case 0x0D:							    \
+		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0D);		    \
+		break;							    \
+	case 0x0E:							    \
+		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0E);		    \
+		break;							    \
+	case 0x0F:							    \
+		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0F);		    \
+		break;							    \
+	default:							    \
+		break;							    \
+	}								    \
+}))
+
+/**
+ * SSE implementation of memcpy for the general (possibly unaligned) case.
+ * Copies n bytes from src to dst; locations should not overlap.
+ * Sizes up to 512 bytes are handled with straight-line moves; larger sizes
+ * first align the store address, then use 256-byte aligned block copies when
+ * src and dst are co-aligned, or MOVEUNALIGNED_LEFT47 when src is offset.
+ * Returns the original dst pointer (memcpy semantics).
+ */
+static inline void *
+rte_memcpy_generic(void *dst, const void *src, size_t n)
+{
+	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
+	uintptr_t dstu = (uintptr_t)dst;
+	uintptr_t srcu = (uintptr_t)src;
+	void *ret = dst;
+	size_t dstofss;
+	size_t srcofs;
+
+	/**
+	 * Copy less than 16 bytes
+	 * (one scalar move per set bit of n: 1, 2, 4, 8 bytes)
+	 */
+	if (n < 16) {
+		if (n & 0x01) {
+			*(uint8_t *)dstu = *(const uint8_t *)srcu;
+			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
+			dstu = (uintptr_t)((uint8_t *)dstu + 1);
+		}
+		if (n & 0x02) {
+			*(uint16_t *)dstu = *(const uint16_t *)srcu;
+			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
+			dstu = (uintptr_t)((uint16_t *)dstu + 1);
+		}
+		if (n & 0x04) {
+			*(uint32_t *)dstu = *(const uint32_t *)srcu;
+			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
+			dstu = (uintptr_t)((uint32_t *)dstu + 1);
+		}
+		if (n & 0x08)
+			*(uint64_t *)dstu = *(const uint64_t *)srcu;
+		return ret;
+	}
+
+	/**
+	 * Fast way when copy size doesn't exceed 512 bytes
+	 * (trailing bytes are covered by an overlapping tail store at
+	 * dst - 16 + n, which is safe because n >= 16 here)
+	 */
+	if (n <= 32) {
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov16((uint8_t *)dst - 16 + n,
+				(const uint8_t *)src - 16 + n);
+		return ret;
+	}
+	if (n <= 48) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov16((uint8_t *)dst - 16 + n,
+				(const uint8_t *)src - 16 + n);
+		return ret;
+	}
+	if (n <= 64) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+		rte_mov16((uint8_t *)dst - 16 + n,
+				(const uint8_t *)src - 16 + n);
+		return ret;
+	}
+	if (n <= 128)
+		goto COPY_BLOCK_128_BACK15;
+	if (n <= 512) {
+		if (n >= 256) {
+			n -= 256;
+			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov128((uint8_t *)dst + 128,
+					(const uint8_t *)src + 128);
+			src = (const uint8_t *)src + 256;
+			dst = (uint8_t *)dst + 256;
+		}
+COPY_BLOCK_255_BACK15:
+		if (n >= 128) {
+			n -= 128;
+			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
+			src = (const uint8_t *)src + 128;
+			dst = (uint8_t *)dst + 128;
+		}
+COPY_BLOCK_128_BACK15:
+		if (n >= 64) {
+			n -= 64;
+			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+			src = (const uint8_t *)src + 64;
+			dst = (uint8_t *)dst + 64;
+		}
+COPY_BLOCK_64_BACK15:
+		if (n >= 32) {
+			n -= 32;
+			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+			src = (const uint8_t *)src + 32;
+			dst = (uint8_t *)dst + 32;
+		}
+		if (n > 16) {
+			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+			rte_mov16((uint8_t *)dst - 16 + n,
+					(const uint8_t *)src - 16 + n);
+			return ret;
+		}
+		if (n > 0) {
+			rte_mov16((uint8_t *)dst - 16 + n,
+					(const uint8_t *)src - 16 + n);
+		}
+		return ret;
+	}
+
+	/**
+	 * Make store aligned when copy size exceeds 512 bytes,
+	 * and make sure the first 15 bytes are copied, because
+	 * unaligned copy functions require up to 15 bytes
+	 * backwards access.
+	 */
+	dstofss = (uintptr_t)dst & 0x0F;
+	if (dstofss > 0) {
+		/* advance by 17..31 bytes so dst becomes 16-byte aligned */
+		dstofss = 16 - dstofss + 16;
+		n -= dstofss;
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		src = (const uint8_t *)src + dstofss;
+		dst = (uint8_t *)dst + dstofss;
+	}
+	srcofs = ((uintptr_t)src & 0x0F);
+
+	/**
+	 * For aligned copy
+	 */
+	if (srcofs == 0) {
+		/**
+		 * Copy 256-byte blocks
+		 */
+		for (; n >= 256; n -= 256) {
+			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
+			dst = (uint8_t *)dst + 256;
+			src = (const uint8_t *)src + 256;
+		}
+
+		/**
+		 * Copy whatever left
+		 */
+		goto COPY_BLOCK_255_BACK15;
+	}
+
+	/**
+	 * For copy with unaligned load
+	 * (leaves at most 47 bytes for the tail path below)
+	 */
+	MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
+
+	/**
+	 * Copy whatever left
+	 */
+	goto COPY_BLOCK_64_BACK15;
+}
+
+#endif /* RTE_MACHINE_CPUFLAG */
+
+/**
+ * Fast-path memcpy used when both dst and src satisfy ALIGNMENT_MASK
+ * (dispatched from rte_memcpy_internal); locations should not overlap.
+ * The <=16 and tail paths use overlapping stores, so no extra alignment
+ * handling is needed here. Returns the original dst pointer.
+ */
+static inline void *
+rte_memcpy_aligned(void *dst, const void *src, size_t n)
+{
+	void *ret = dst;
+
+	/* Copy size <= 16 bytes (scalar moves keyed off the bits of n) */
+	if (n < 16) {
+		if (n & 0x01) {
+			*(uint8_t *)dst = *(const uint8_t *)src;
+			src = (const uint8_t *)src + 1;
+			dst = (uint8_t *)dst + 1;
+		}
+		if (n & 0x02) {
+			*(uint16_t *)dst = *(const uint16_t *)src;
+			src = (const uint16_t *)src + 1;
+			dst = (uint16_t *)dst + 1;
+		}
+		if (n & 0x04) {
+			*(uint32_t *)dst = *(const uint32_t *)src;
+			src = (const uint32_t *)src + 1;
+			dst = (uint32_t *)dst + 1;
+		}
+		if (n & 0x08)
+			*(uint64_t *)dst = *(const uint64_t *)src;
+
+		return ret;
+	}
+
+	/* Copy 16 <= size <= 32 bytes (tail store may overlap the head) */
+	if (n <= 32) {
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov16((uint8_t *)dst - 16 + n,
+				(const uint8_t *)src - 16 + n);
+
+		return ret;
+	}
+
+	/* Copy 32 < size <= 64 bytes */
+	if (n <= 64) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov32((uint8_t *)dst - 32 + n,
+				(const uint8_t *)src - 32 + n);
+
+		return ret;
+	}
+
+	/* Copy 64 bytes blocks */
+	for (; n >= 64; n -= 64) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		dst = (uint8_t *)dst + 64;
+		src = (const uint8_t *)src + 64;
+	}
+
+	/* Copy whatever left (overlapping 64-byte tail store; n >= 64 held
+	 * before the loop, so dst - 64 + n never precedes the original dst) */
+	rte_mov64((uint8_t *)dst - 64 + n,
+			(const uint8_t *)src - 64 + n);
+
+	return ret;
+}
+
+/**
+ * Dispatch between the aligned fast path and the generic path:
+ * rte_memcpy_aligned() is taken only when neither pointer has any bit of
+ * ALIGNMENT_MASK set, i.e. both dst and src meet the required alignment.
+ */
+static inline void *
+rte_memcpy_internal(void *dst, const void *src, size_t n)
+{
+	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
+		return rte_memcpy_aligned(dst, src, n);
+	else
+		return rte_memcpy_generic(dst, src, n);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_MEMCPY_INTERNAL_X86_64_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index 21e0b4a..6ee7f23 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -102,6 +102,24 @@  SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_service.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_cpuflags.c
 SRCS-$(CONFIG_RTE_ARCH_X86) += rte_spinlock.c
 
+# for run-time dispatch of memcpy
+SRCS-$(CONFIG_RTE_ARCH_X86) += rte_memcpy.c
+SRCS-$(CONFIG_RTE_ARCH_X86) += rte_memcpy_sse.c
+
+# if the compiler supports AVX512, add avx512 file
+ifneq ($(findstring CC_SUPPORT_AVX512F,$(MACHINE_CFLAGS)),)
+SRCS-$(CONFIG_RTE_ARCH_X86) += rte_memcpy_avx512f.c
+CFLAGS_rte_memcpy_avx512f.o += -mavx512f
+CFLAGS_rte_memcpy_avx512f.o += -DRTE_MACHINE_CPUFLAG_AVX512F
+endif
+
+# if the compiler supports AVX2, add avx2 file
+ifneq ($(findstring CC_SUPPORT_AVX2,$(MACHINE_CFLAGS)),)
+SRCS-$(CONFIG_RTE_ARCH_X86) += rte_memcpy_avx2.c
+CFLAGS_rte_memcpy_avx2.o += -mavx2
+CFLAGS_rte_memcpy_avx2.o += -DRTE_MACHINE_CPUFLAG_AVX2
+endif
+
 CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST)
 
 CFLAGS_eal.o := -D_GNU_SOURCE
diff --git a/lib/librte_eal/linuxapp/eal/rte_eal_version.map b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
index fe186cb..c173ccf 100644
--- a/lib/librte_eal/linuxapp/eal/rte_eal_version.map
+++ b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
@@ -247,6 +247,7 @@  DPDK_17.11 {
 	rte_eal_iova_mode;
 	rte_eal_mbuf_default_mempool_ops;
 	rte_lcore_has_role;
+	rte_memcpy_ptr;
 	rte_pci_get_iommu_class;
 	rte_pci_match;
 
diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
index a813c91..8a7a1e7 100644
--- a/mk/rte.cpuflags.mk
+++ b/mk/rte.cpuflags.mk
@@ -134,6 +134,20 @@  endif
 
 MACHINE_CFLAGS += $(addprefix -DRTE_MACHINE_CPUFLAG_,$(CPUFLAGS))
 
+# Check if the compiler supports AVX512
+CC_SUPPORT_AVX512F := $(shell $(CC) -mavx512f -dM -E - < /dev/null 2>&1 | grep -q AVX512 && echo 1)
+ifeq ($(CC_SUPPORT_AVX512F),1)
+ifeq ($(CONFIG_RTE_ENABLE_AVX512),y)
+MACHINE_CFLAGS += -DCC_SUPPORT_AVX512F
+endif
+endif
+
+# Check if the compiler supports AVX2
+CC_SUPPORT_AVX2 := $(shell $(CC) -mavx2 -dM -E - < /dev/null 2>&1 | grep -q AVX2 && echo 1)
+ifeq ($(CC_SUPPORT_AVX2),1)
+MACHINE_CFLAGS += -DCC_SUPPORT_AVX2
+endif
+
 # To strip whitespace
 comma:= ,
 empty:=