[v2] app/dma-perf: replace pktmbuf with mempool objects

Message ID 20231220110333.619-1-vipin.varghese@amd.com (mailing list archive)
State New
Delegated to: Thomas Monjalon
Headers
Series [v2] app/dma-perf: replace pktmbuf with mempool objects |

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/loongarch-compilation success Compilation OK
ci/loongarch-unit-testing success Unit Testing PASS
ci/github-robot: build success github build: passed
ci/Intel-compilation success Compilation OK
ci/intel-Testing success Testing PASS
ci/intel-Functional success Functional PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-broadcom-Functional success Functional Testing PASS
ci/iol-abi-testing success Testing PASS
ci/iol-unit-arm64-testing success Testing PASS
ci/iol-unit-amd64-testing success Testing PASS
ci/iol-compile-amd64-testing success Testing PASS
ci/iol-compile-arm64-testing success Testing PASS

Commit Message

Vipin Varghese Dec. 20, 2023, 11:03 a.m. UTC
  From: Vipin Varghese <Vipin.Varghese@amd.com>

Replace pktmbuf pool with mempool, this allows increase in MOPS
especially in lower buffer size. Using Mempool, allows to reduce
the extra CPU cycles.

Changes made are
1. pktmbuf pool create with mempool create.
2. create src & dst pointer array from the appropriate numa.
3. use get pool and put for mempool objects.
4. remove pktmbuf_mtod for dma and cpu memcpy.

v2 changes:
 - add ACK from  Morten Brørup

v1 changes:
 - pktmbuf pool create with mempool create.
 - create src & dst pointer array from the appropriate numa.
 - use get pool and put for mempool objects.
 - remove pktmbuf_mtod for dma and cpu memcpy.

Test Results for pktmbuf vs mempool:
====================================

Format: Buffer Size | % AVG cycles | % AVG Gbps

Category-1: HW-DSA
-------------------
  64|-13.11| 14.97
 128|-41.49|  0.41
 256| -1.85|  1.20
 512| -9.38|  8.81
1024|  1.82| -2.00
1518|  0.00| -0.80
2048|  1.03| -0.91
4096|  0.00| -0.35
8192|  0.07| -0.08

Category-2: MEMCPY
-------------------
  64|-12.50|14.14
 128|-40.63|67.26
 256|-38.78|59.35
 512|-30.26|43.36
1024|-21.80|27.04
1518|-16.23|19.33
2048|-14.75|16.81
4096| -9.56|10.01
8192| -3.32| 3.12

Signed-off-by: Vipin Varghese <vipin.varghese@amd.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
Tested-by: Thiyagrajan P <thiyagarajan.p@amd.com>
---
---
 app/test-dma-perf/benchmark.c | 74 +++++++++++++++++++++--------------
 1 file changed, 44 insertions(+), 30 deletions(-)
  

Comments

Chengwen Feng Feb. 26, 2024, 2:05 a.m. UTC | #1
Hi Vipin,

On 2023/12/20 19:03, Vipin Varghese wrote:
> From: Vipin Varghese <Vipin.Varghese@amd.com>
> 
> Replace pktmbuf pool with mempool, this allows increase in MOPS
> especially in lower buffer size. Using Mempool, allows to reduce
> the extra CPU cycles.
> 
> Changes made are
> 1. pktmbuf pool create with mempool create.
> 2. create src & dst pointer array from the appropaite numa.
> 3. use get pool and put for mempool objects.
> 4. remove pktmbuf_mtod for dma and cpu memcpy.
> 
> v2 changes:
>  - add ACK from  Morten Brørup
> 
> v1 changes:
>  - pktmbuf pool create with mempool create.
>  - create src & dst pointer array from the appropaite numa.
>  - use get pool and put for mempool objects.
>  - remove pktmbuf_mtod for dma and cpu memcpy.
> 
> Test Results for pktmbuf vs mempool:
> ====================================
> 
> Format: Buffer Size | % AVG cycles | % AVG Gbps
> 
> Category-1: HW-DSA
> -------------------
>   64|-13.11| 14.97
>  128|-41.49|  0.41
>  256| -1.85|  1.20
>  512| -9.38|  8.81
> 1024|  1.82| -2.00
> 1518|  0.00| -0.80
> 2048|  1.03| -0.91
> 4096|  0.00| -0.35
> 8192|  0.07| -0.08
> 
> Category-2: MEMCPY
> -------------------
>   64|-12.50|14.14
>  128|-40.63|67.26
>  256|-38.78|59.35
>  512|-30.26|43.36
> 1024|-21.80|27.04
> 1518|-16.23|19.33
> 2048|-14.75|16.81
> 4096| -9.56|10.01
> 8192| -3.32| 3.12
> 
> Signed-off-by: Vipin Varghese <vipin.varghese@amd.com>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> Tested-by: Thiyagrajan P <thiyagarajan.p@amd.com>
> ---
> ---
>  app/test-dma-perf/benchmark.c | 74 +++++++++++++++++++++--------------
>  1 file changed, 44 insertions(+), 30 deletions(-)
> 
> diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
> index 9b1f58c78c..dc6f16cc01 100644
> --- a/app/test-dma-perf/benchmark.c
> +++ b/app/test-dma-perf/benchmark.c
> @@ -43,8 +43,8 @@ struct lcore_params {
>  	uint16_t kick_batch;
>  	uint32_t buf_size;
>  	uint16_t test_secs;
> -	struct rte_mbuf **srcs;
> -	struct rte_mbuf **dsts;
> +	void **srcs;
> +	void **dsts;
>  	volatile struct worker_info worker_info;
>  };
>  
> @@ -110,17 +110,17 @@ output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t r
>  }
>  
>  static inline void
> -cache_flush_buf(__rte_unused struct rte_mbuf **array,
> +cache_flush_buf(__rte_unused void **array,
>  		__rte_unused uint32_t buf_size,
>  		__rte_unused uint32_t nr_buf)
>  {
>  #ifdef RTE_ARCH_X86_64
>  	char *data;
> -	struct rte_mbuf **srcs = array;
> +	void **srcs = array;
>  	uint32_t i, offset;
>  
>  	for (i = 0; i < nr_buf; i++) {
> -		data = rte_pktmbuf_mtod(srcs[i], char *);
> +		data = (char *) srcs[i];
>  		for (offset = 0; offset < buf_size; offset += 64)
>  			__builtin_ia32_clflush(data + offset);
>  	}
> @@ -224,8 +224,8 @@ do_dma_mem_copy(void *p)
>  	const uint32_t nr_buf = para->nr_buf;
>  	const uint16_t kick_batch = para->kick_batch;
>  	const uint32_t buf_size = para->buf_size;
> -	struct rte_mbuf **srcs = para->srcs;
> -	struct rte_mbuf **dsts = para->dsts;
> +	void **srcs = para->srcs;
> +	void **dsts = para->dsts;
>  	uint16_t nr_cpl;
>  	uint64_t async_cnt = 0;
>  	uint32_t i;
> @@ -241,8 +241,12 @@ do_dma_mem_copy(void *p)
>  	while (1) {
>  		for (i = 0; i < nr_buf; i++) {
>  dma_copy:
> -			ret = rte_dma_copy(dev_id, 0, rte_mbuf_data_iova(srcs[i]),
> -				rte_mbuf_data_iova(dsts[i]), buf_size, 0);
> +			ret = rte_dma_copy(dev_id,
> +					0,
> +					(rte_iova_t) srcs[i],
> +					(rte_iova_t) dsts[i],

This should consider the IOVA != VA case, so rte_mempool_virt2iova() should be
used here. However, since this commit mainly aims to eliminate the address
conversion overhead, we should prepare IOVA for the DMA copy and VA for the memory copy.

I prefer to keep pkt_mbuf, but add two new fields, create these two fields in setup_memory_env(),
and then use them directly in do_xxx_mem_copy.

Thanks.

> +					buf_size,
> +					0);
>  			if (unlikely(ret < 0)) {
>  				if (ret == -ENOSPC) {
>  					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
> @@ -276,8 +280,8 @@ do_cpu_mem_copy(void *p)
>  	volatile struct worker_info *worker_info = &(para->worker_info);
>  	const uint32_t nr_buf = para->nr_buf;
>  	const uint32_t buf_size = para->buf_size;
> -	struct rte_mbuf **srcs = para->srcs;
> -	struct rte_mbuf **dsts = para->dsts;
> +	void **srcs = para->srcs;
> +	void **dsts = para->dsts;
>  	uint32_t i;
>  
>  	worker_info->stop_flag = false;
> @@ -288,8 +292,8 @@ do_cpu_mem_copy(void *p)
>  
>  	while (1) {
>  		for (i = 0; i < nr_buf; i++) {
> -			const void *src = rte_pktmbuf_mtod(dsts[i], void *);
> -			void *dst = rte_pktmbuf_mtod(srcs[i], void *);
> +			const void *src = (void *) dsts[i];
> +			void *dst = (void *) srcs[i];
>  
>  			/* copy buffer form src to dst */
>  			rte_memcpy(dst, src, (size_t)buf_size);
> @@ -303,8 +307,8 @@ do_cpu_mem_copy(void *p)
>  }
>  
>  static int
> -setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
> -			struct rte_mbuf ***dsts)
> +setup_memory_env(struct test_configure *cfg, void ***srcs,
> +			void ***dsts)
>  {
>  	unsigned int buf_size = cfg->buf_size.cur;
>  	unsigned int nr_sockets;
> @@ -317,47 +321,57 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
>  		return -1;
>  	}
>  
> -	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
> +	src_pool = rte_mempool_create("Benchmark_DMA_SRC",
>  			nr_buf,
> +			buf_size,
>  			0,
>  			0,
> -			buf_size + RTE_PKTMBUF_HEADROOM,
> -			cfg->src_numa_node);
> +			NULL,
> +			NULL,
> +			NULL,
> +			NULL,
> +			cfg->src_numa_node,
> +			RTE_MEMPOOL_F_SP_PUT | RTE_MEMPOOL_F_SC_GET);
>  	if (src_pool == NULL) {
>  		PRINT_ERR("Error with source mempool creation.\n");
>  		return -1;
>  	}
>  
> -	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
> +	dst_pool = rte_mempool_create("Benchmark_DMA_DST",
>  			nr_buf,
> +			buf_size,
>  			0,
>  			0,
> -			buf_size + RTE_PKTMBUF_HEADROOM,
> -			cfg->dst_numa_node);
> +			NULL,
> +			NULL,
> +			NULL,
> +			NULL,
> +			cfg->dst_numa_node,
> +			RTE_MEMPOOL_F_SP_PUT | RTE_MEMPOOL_F_SC_GET);
>  	if (dst_pool == NULL) {
>  		PRINT_ERR("Error with destination mempool creation.\n");
>  		return -1;
>  	}
>  
> -	*srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
> +	*srcs = rte_malloc_socket(NULL, nr_buf * sizeof(unsigned char *), 0, cfg->src_numa_node);
>  	if (*srcs == NULL) {
>  		printf("Error: srcs malloc failed.\n");
>  		return -1;
>  	}
>  
> -	*dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
> +	*dsts = rte_malloc_socket(NULL, nr_buf * sizeof(unsigned char *), 0, cfg->dst_numa_node);
>  	if (*dsts == NULL) {
>  		printf("Error: dsts malloc failed.\n");
>  		return -1;
>  	}
>  
> -	if (rte_pktmbuf_alloc_bulk(src_pool, *srcs, nr_buf) != 0) {
> -		printf("alloc src mbufs failed.\n");
> +	if (rte_mempool_get_bulk(src_pool, *srcs, nr_buf) != 0) {
> +		printf("alloc src bufs failed.\n");
>  		return -1;
>  	}
>  
> -	if (rte_pktmbuf_alloc_bulk(dst_pool, *dsts, nr_buf) != 0) {
> -		printf("alloc dst mbufs failed.\n");
> +	if (rte_mempool_get_bulk(dst_pool, *dsts, nr_buf) != 0) {
> +		printf("alloc dst bufs failed.\n");
>  		return -1;
>  	}
>  
> @@ -370,7 +384,7 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  	uint16_t i;
>  	uint32_t offset;
>  	unsigned int lcore_id = 0;
> -	struct rte_mbuf **srcs = NULL, **dsts = NULL;
> +	void **srcs = NULL, **dsts = NULL;
>  	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
>  	unsigned int buf_size = cfg->buf_size.cur;
>  	uint16_t kick_batch = cfg->kick_batch.cur;
> @@ -478,9 +492,9 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  out:
>  	/* free mbufs used in the test */
>  	if (srcs != NULL)
> -		rte_pktmbuf_free_bulk(srcs, nr_buf);
> +		rte_mempool_put_bulk(src_pool, srcs, nr_buf);
>  	if (dsts != NULL)
> -		rte_pktmbuf_free_bulk(dsts, nr_buf);
> +		rte_mempool_put_bulk(dst_pool, dsts, nr_buf);
>  
>  	/* free the points for the mbufs */
>  	rte_free(srcs);
>
  
Vipin Varghese Feb. 27, 2024, 9:57 a.m. UTC | #2
On 2/26/2024 7:35 AM, fengchengwen wrote:
> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>
>
> Hi Vipin,
>
> On 2023/12/20 19:03, Vipin Varghese wrote:
>> From: Vipin Varghese <Vipin.Varghese@amd.com>
>>
>> Replace pktmbuf pool with mempool, this allows increase in MOPS
>> especially in lower buffer size. Using Mempool, allows to reduce
>> the extra CPU cycles.
>>
>> Changes made are
>> 1. pktmbuf pool create with mempool create.
>> 2. create src & dst pointer array from the appropaite numa.
>> 3. use get pool and put for mempool objects.
>> 4. remove pktmbuf_mtod for dma and cpu memcpy.
>>
>> v2 changes:
>>   - add ACK from  Morten Brørup
>>
>> v1 changes:
>>   - pktmbuf pool create with mempool create.
>>   - create src & dst pointer array from the appropaite numa.
>>   - use get pool and put for mempool objects.
>>   - remove pktmbuf_mtod for dma and cpu memcpy.
>>
>> Test Results for pktmbuf vs mempool:
>> ====================================
>>
>> Format: Buffer Size | % AVG cycles | % AVG Gbps
>>
>> Category-1: HW-DSA
>> -------------------
>>    64|-13.11| 14.97
>>   128|-41.49|  0.41
>>   256| -1.85|  1.20
>>   512| -9.38|  8.81
>> 1024|  1.82| -2.00
>> 1518|  0.00| -0.80
>> 2048|  1.03| -0.91
>> 4096|  0.00| -0.35
>> 8192|  0.07| -0.08
>>
>> Category-2: MEMCPY
>> -------------------
>>    64|-12.50|14.14
>>   128|-40.63|67.26
>>   256|-38.78|59.35
>>   512|-30.26|43.36
>> 1024|-21.80|27.04
>> 1518|-16.23|19.33
>> 2048|-14.75|16.81
>> 4096| -9.56|10.01
>> 8192| -3.32| 3.12
>>
>> Signed-off-by: Vipin Varghese <vipin.varghese@amd.com>
>> Acked-by: Morten Brørup <mb@smartsharesystems.com>
>> Tested-by: Thiyagrajan P <thiyagarajan.p@amd.com>
>> ---
>> ---
>>   app/test-dma-perf/benchmark.c | 74 +++++++++++++++++++++--------------
>>   1 file changed, 44 insertions(+), 30 deletions(-)
>>
>> diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
>> index 9b1f58c78c..dc6f16cc01 100644
>> --- a/app/test-dma-perf/benchmark.c
>> +++ b/app/test-dma-perf/benchmark.c
>> @@ -43,8 +43,8 @@ struct lcore_params {
>>        uint16_t kick_batch;
>>        uint32_t buf_size;
>>        uint16_t test_secs;
>> -     struct rte_mbuf **srcs;
>> -     struct rte_mbuf **dsts;
>> +     void **srcs;
>> +     void **dsts;
>>        volatile struct worker_info worker_info;
>>   };
>>
>> @@ -110,17 +110,17 @@ output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t r
>>   }
>>
>>   static inline void
>> -cache_flush_buf(__rte_unused struct rte_mbuf **array,
>> +cache_flush_buf(__rte_unused void **array,
>>                __rte_unused uint32_t buf_size,
>>                __rte_unused uint32_t nr_buf)
>>   {
>>   #ifdef RTE_ARCH_X86_64
>>        char *data;
>> -     struct rte_mbuf **srcs = array;
>> +     void **srcs = array;
>>        uint32_t i, offset;
>>
>>        for (i = 0; i < nr_buf; i++) {
>> -             data = rte_pktmbuf_mtod(srcs[i], char *);
>> +             data = (char *) srcs[i];
>>                for (offset = 0; offset < buf_size; offset += 64)
>>                        __builtin_ia32_clflush(data + offset);
>>        }
>> @@ -224,8 +224,8 @@ do_dma_mem_copy(void *p)
>>        const uint32_t nr_buf = para->nr_buf;
>>        const uint16_t kick_batch = para->kick_batch;
>>        const uint32_t buf_size = para->buf_size;
>> -     struct rte_mbuf **srcs = para->srcs;
>> -     struct rte_mbuf **dsts = para->dsts;
>> +     void **srcs = para->srcs;
>> +     void **dsts = para->dsts;
>>        uint16_t nr_cpl;
>>        uint64_t async_cnt = 0;
>>        uint32_t i;
>> @@ -241,8 +241,12 @@ do_dma_mem_copy(void *p)
>>        while (1) {
>>                for (i = 0; i < nr_buf; i++) {
>>   dma_copy:
>> -                     ret = rte_dma_copy(dev_id, 0, rte_mbuf_data_iova(srcs[i]),
>> -                             rte_mbuf_data_iova(dsts[i]), buf_size, 0);
>> +                     ret = rte_dma_copy(dev_id,
>> +                                     0,
>> +                                     (rte_iova_t) srcs[i],
>> +                                     (rte_iova_t) dsts[i],
Thank you ChengWen for the suggestion, please find my observations below
> should consider IOVA != VA, so here should be with rte_mempool_virt2iova(),
> but this commit is mainly to eliminate the address convert overload, so we
> should prepare IOVA for DMA copy, and VA for memory copy.

yes, Ferruh helped me to understand. Please let me look into this and 
share a v3 soon.

>
> I prefer keep pkt_mbuf, but new add two field, and create this two field when setup_memory_env(),
> then direct use them in do_xxx_mem_copy。

Please help me understand if you are suggesting, in function 
`setup_memory_env` we still keep pkt_mbuf creation.

But when the arrays are created instead of populating them with mbuf, we 
directly call `pktmbuf_mtod` and store the

starting address. Thus in cpu-copy or dma-copy we do not spend time in 
compute. Is this what you mean?


My reasoning for not using pktmbuf is as follows

1. pkt_mbuf has rte_mbuf metadata + private + headroom + tailroom

2. so when creating payloads for 2K, 4K, 8K, 16K, 32K, 1GB we are 
accounting for extra headroom, which is not efficient

3. dma-perf is targeted for performance and not network function.

4. there is an existing example which makes use pktmbuf and dma calls.


hence I would like to use mempool which also helps per numa with flags.


>
> Thanks.
>
>> +                                     buf_size,
>> +                                     0);
>>                        if (unlikely(ret < 0)) {
>>                                if (ret == -ENOSPC) {
>>                                        do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
>> @@ -276,8 +280,8 @@ do_cpu_mem_copy(void *p)
>>        volatile struct worker_info *worker_info = &(para->worker_info);
>>        const uint32_t nr_buf = para->nr_buf;
>>        const uint32_t buf_size = para->buf_size;
>> -     struct rte_mbuf **srcs = para->srcs;
>> -     struct rte_mbuf **dsts = para->dsts;
>> +     void **srcs = para->srcs;
>> +     void **dsts = para->dsts;
>>        uint32_t i;
>>
>>        worker_info->stop_flag = false;
>> @@ -288,8 +292,8 @@ do_cpu_mem_copy(void *p)
>>
>>        while (1) {
>>                for (i = 0; i < nr_buf; i++) {
>> -                     const void *src = rte_pktmbuf_mtod(dsts[i], void *);
>> -                     void *dst = rte_pktmbuf_mtod(srcs[i], void *);
>> +                     const void *src = (void *) dsts[i];
>> +                     void *dst = (void *) srcs[i];
>>
>>                        /* copy buffer form src to dst */
>>                        rte_memcpy(dst, src, (size_t)buf_size);
>> @@ -303,8 +307,8 @@ do_cpu_mem_copy(void *p)
>>   }
>>
>>   static int
>> -setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
>> -                     struct rte_mbuf ***dsts)
>> +setup_memory_env(struct test_configure *cfg, void ***srcs,
>> +                     void ***dsts)
>>   {
>>        unsigned int buf_size = cfg->buf_size.cur;
>>        unsigned int nr_sockets;
>> @@ -317,47 +321,57 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
>>                return -1;
>>        }
>>
>> -     src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
>> +     src_pool = rte_mempool_create("Benchmark_DMA_SRC",
>>                        nr_buf,
>> +                     buf_size,
>>                        0,
>>                        0,
>> -                     buf_size + RTE_PKTMBUF_HEADROOM,
>> -                     cfg->src_numa_node);
>> +                     NULL,
>> +                     NULL,
>> +                     NULL,
>> +                     NULL,
>> +                     cfg->src_numa_node,
>> +                     RTE_MEMPOOL_F_SP_PUT | RTE_MEMPOOL_F_SC_GET);
>>        if (src_pool == NULL) {
>>                PRINT_ERR("Error with source mempool creation.\n");
>>                return -1;
>>        }
>>
>> -     dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
>> +     dst_pool = rte_mempool_create("Benchmark_DMA_DST",
>>                        nr_buf,
>> +                     buf_size,
>>                        0,
>>                        0,
>> -                     buf_size + RTE_PKTMBUF_HEADROOM,
>> -                     cfg->dst_numa_node);
>> +                     NULL,
>> +                     NULL,
>> +                     NULL,
>> +                     NULL,
>> +                     cfg->dst_numa_node,
>> +                     RTE_MEMPOOL_F_SP_PUT | RTE_MEMPOOL_F_SC_GET);
>>        if (dst_pool == NULL) {
>>                PRINT_ERR("Error with destination mempool creation.\n");
>>                return -1;
>>        }
>>
>> -     *srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
>> +     *srcs = rte_malloc_socket(NULL, nr_buf * sizeof(unsigned char *), 0, cfg->src_numa_node);
>>        if (*srcs == NULL) {
>>                printf("Error: srcs malloc failed.\n");
>>                return -1;
>>        }
>>
>> -     *dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
>> +     *dsts = rte_malloc_socket(NULL, nr_buf * sizeof(unsigned char *), 0, cfg->dst_numa_node);
>>        if (*dsts == NULL) {
>>                printf("Error: dsts malloc failed.\n");
>>                return -1;
>>        }
>>
>> -     if (rte_pktmbuf_alloc_bulk(src_pool, *srcs, nr_buf) != 0) {
>> -             printf("alloc src mbufs failed.\n");
>> +     if (rte_mempool_get_bulk(src_pool, *srcs, nr_buf) != 0) {
>> +             printf("alloc src bufs failed.\n");
>>                return -1;
>>        }
>>
>> -     if (rte_pktmbuf_alloc_bulk(dst_pool, *dsts, nr_buf) != 0) {
>> -             printf("alloc dst mbufs failed.\n");
>> +     if (rte_mempool_get_bulk(dst_pool, *dsts, nr_buf) != 0) {
>> +             printf("alloc dst bufs failed.\n");
>>                return -1;
>>        }
>>
>> @@ -370,7 +384,7 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>>        uint16_t i;
>>        uint32_t offset;
>>        unsigned int lcore_id = 0;
>> -     struct rte_mbuf **srcs = NULL, **dsts = NULL;
>> +     void **srcs = NULL, **dsts = NULL;
>>        struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
>>        unsigned int buf_size = cfg->buf_size.cur;
>>        uint16_t kick_batch = cfg->kick_batch.cur;
>> @@ -478,9 +492,9 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>>   out:
>>        /* free mbufs used in the test */
>>        if (srcs != NULL)
>> -             rte_pktmbuf_free_bulk(srcs, nr_buf);
>> +             rte_mempool_put_bulk(src_pool, srcs, nr_buf);
>>        if (dsts != NULL)
>> -             rte_pktmbuf_free_bulk(dsts, nr_buf);
>> +             rte_mempool_put_bulk(dst_pool, dsts, nr_buf);
>>
>>        /* free the points for the mbufs */
>>        rte_free(srcs);
>>
  
Chengwen Feng Feb. 27, 2024, 12:27 p.m. UTC | #3
Hi Vipin,

On 2024/2/27 17:57, Varghese, Vipin wrote:
> 
> On 2/26/2024 7:35 AM, fengchengwen wrote:
>> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>>
>>
>> Hi Vipin,
>>
>> On 2023/12/20 19:03, Vipin Varghese wrote:
>>> From: Vipin Varghese <Vipin.Varghese@amd.com>
>>>
>>> Replace pktmbuf pool with mempool, this allows increase in MOPS
>>> especially in lower buffer size. Using Mempool, allows to reduce
>>> the extra CPU cycles.
>>>
>>> Changes made are
>>> 1. pktmbuf pool create with mempool create.
>>> 2. create src & dst pointer array from the appropaite numa.
>>> 3. use get pool and put for mempool objects.
>>> 4. remove pktmbuf_mtod for dma and cpu memcpy.
>>>
>>> v2 changes:
>>>   - add ACK from  Morten Brørup
>>>
>>> v1 changes:
>>>   - pktmbuf pool create with mempool create.
>>>   - create src & dst pointer array from the appropaite numa.
>>>   - use get pool and put for mempool objects.
>>>   - remove pktmbuf_mtod for dma and cpu memcpy.
>>>
>>> Test Results for pktmbuf vs mempool:
>>> ====================================
>>>
>>> Format: Buffer Size | % AVG cycles | % AVG Gbps
>>>
>>> Category-1: HW-DSA
>>> -------------------
>>>    64|-13.11| 14.97
>>>   128|-41.49|  0.41
>>>   256| -1.85|  1.20
>>>   512| -9.38|  8.81
>>> 1024|  1.82| -2.00
>>> 1518|  0.00| -0.80
>>> 2048|  1.03| -0.91
>>> 4096|  0.00| -0.35
>>> 8192|  0.07| -0.08
>>>
>>> Category-2: MEMCPY
>>> -------------------
>>>    64|-12.50|14.14
>>>   128|-40.63|67.26
>>>   256|-38.78|59.35
>>>   512|-30.26|43.36
>>> 1024|-21.80|27.04
>>> 1518|-16.23|19.33
>>> 2048|-14.75|16.81
>>> 4096| -9.56|10.01
>>> 8192| -3.32| 3.12
>>>
>>> Signed-off-by: Vipin Varghese <vipin.varghese@amd.com>
>>> Acked-by: Morten Brørup <mb@smartsharesystems.com>
>>> Tested-by: Thiyagrajan P <thiyagarajan.p@amd.com>
>>> ---
>>> ---
>>>   app/test-dma-perf/benchmark.c | 74 +++++++++++++++++++++--------------
>>>   1 file changed, 44 insertions(+), 30 deletions(-)
>>>
>>> diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
>>> index 9b1f58c78c..dc6f16cc01 100644
>>> --- a/app/test-dma-perf/benchmark.c
>>> +++ b/app/test-dma-perf/benchmark.c
>>> @@ -43,8 +43,8 @@ struct lcore_params {
>>>        uint16_t kick_batch;
>>>        uint32_t buf_size;
>>>        uint16_t test_secs;
>>> -     struct rte_mbuf **srcs;
>>> -     struct rte_mbuf **dsts;
>>> +     void **srcs;
>>> +     void **dsts;
>>>        volatile struct worker_info worker_info;
>>>   };
>>>
>>> @@ -110,17 +110,17 @@ output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t r
>>>   }
>>>
>>>   static inline void
>>> -cache_flush_buf(__rte_unused struct rte_mbuf **array,
>>> +cache_flush_buf(__rte_unused void **array,
>>>                __rte_unused uint32_t buf_size,
>>>                __rte_unused uint32_t nr_buf)
>>>   {
>>>   #ifdef RTE_ARCH_X86_64
>>>        char *data;
>>> -     struct rte_mbuf **srcs = array;
>>> +     void **srcs = array;
>>>        uint32_t i, offset;
>>>
>>>        for (i = 0; i < nr_buf; i++) {
>>> -             data = rte_pktmbuf_mtod(srcs[i], char *);
>>> +             data = (char *) srcs[i];
>>>                for (offset = 0; offset < buf_size; offset += 64)
>>>                        __builtin_ia32_clflush(data + offset);
>>>        }
>>> @@ -224,8 +224,8 @@ do_dma_mem_copy(void *p)
>>>        const uint32_t nr_buf = para->nr_buf;
>>>        const uint16_t kick_batch = para->kick_batch;
>>>        const uint32_t buf_size = para->buf_size;
>>> -     struct rte_mbuf **srcs = para->srcs;
>>> -     struct rte_mbuf **dsts = para->dsts;
>>> +     void **srcs = para->srcs;
>>> +     void **dsts = para->dsts;
>>>        uint16_t nr_cpl;
>>>        uint64_t async_cnt = 0;
>>>        uint32_t i;
>>> @@ -241,8 +241,12 @@ do_dma_mem_copy(void *p)
>>>        while (1) {
>>>                for (i = 0; i < nr_buf; i++) {
>>>   dma_copy:
>>> -                     ret = rte_dma_copy(dev_id, 0, rte_mbuf_data_iova(srcs[i]),
>>> -                             rte_mbuf_data_iova(dsts[i]), buf_size, 0);
>>> +                     ret = rte_dma_copy(dev_id,
>>> +                                     0,
>>> +                                     (rte_iova_t) srcs[i],
>>> +                                     (rte_iova_t) dsts[i],
> Thank you ChengWen for the suggestion, please find my observations below
>> should consider IOVA != VA, so here should be with rte_mempool_virt2iova(),
>> but this commit is mainly to eliminate the address convert overload, so we
>> should prepare IOVA for DMA copy, and VA for memory copy.
> 
> yes, Ferruh helped me to understand. Please let me look into this and share a v3 soon.
> 
>>
>> I prefer keep pkt_mbuf, but new add two field, and create this two field when setup_memory_env(),
>> then direct use them in do_xxx_mem_copy。
> 
> Please help me understand if you are suggesting, in function `setup_memory_env` we still keep pkt_mbuf creation.
> 
> But when the arrays are created instead of populating them with mbuf, we directly call `pktmbuf_mtod` and store the
> 
> starting address. Thus in cpu-copy or dma-copy we do not spent time in compute. Is this what you mean?

Yes

> 
> 
> My reasoning for not using pktmbuf is as follows
> 
> 1. pkt_mbuf has rte_mbuf metadata + private + headroom + tailroom
> 
> 2. so when create payload for 2K, 4K, 8K, 16K, 32K, 1GB we are accounting for extra headroom. which is not efficent
> 
> 3. dma-perf is targeted for performance and not network function.
> 
> 4. there is an existing example which makes use pktmbuf and dma calls.
> 
> 
> hence I would like to use mempool which also helps per numa with flags.

As I understand it, the low performance is mainly because the CPU can't keep up with the DMA
device's performance, so the CPU is the bottleneck; when we reduce the CPU's tasks (as this
commit does), the performance improves.

This commit can test the maximum performance when the CPU and DMA cowork together, so I think we can
add this commit.

pktmbuf is a popular programming entity, and almost all applications (including examples) in the DPDK
community are based on pktmbuf.

I think that keeping the use of pktmbuf provides flexibility: someone who wants to do more operations
with pktmbuf (perhaps to emulate real logic) could easily modify and test it.

Thanks

> 
> 
>>
>> Thanks.
>>
>>> +                                     buf_size,
>>> +                                     0);
>>>                        if (unlikely(ret < 0)) {
>>>                                if (ret == -ENOSPC) {
>>>                                        do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
>>> @@ -276,8 +280,8 @@ do_cpu_mem_copy(void *p)
>>>        volatile struct worker_info *worker_info = &(para->worker_info);
>>>        const uint32_t nr_buf = para->nr_buf;
>>>        const uint32_t buf_size = para->buf_size;
>>> -     struct rte_mbuf **srcs = para->srcs;
>>> -     struct rte_mbuf **dsts = para->dsts;
>>> +     void **srcs = para->srcs;
>>> +     void **dsts = para->dsts;
>>>        uint32_t i;
>>>
>>>        worker_info->stop_flag = false;
>>> @@ -288,8 +292,8 @@ do_cpu_mem_copy(void *p)
>>>
>>>        while (1) {
>>>                for (i = 0; i < nr_buf; i++) {
>>> -                     const void *src = rte_pktmbuf_mtod(dsts[i], void *);
>>> -                     void *dst = rte_pktmbuf_mtod(srcs[i], void *);
>>> +                     const void *src = (void *) dsts[i];
>>> +                     void *dst = (void *) srcs[i];
>>>
>>>                        /* copy buffer form src to dst */
>>>                        rte_memcpy(dst, src, (size_t)buf_size);
>>> @@ -303,8 +307,8 @@ do_cpu_mem_copy(void *p)
>>>   }
>>>
>>>   static int
>>> -setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
>>> -                     struct rte_mbuf ***dsts)
>>> +setup_memory_env(struct test_configure *cfg, void ***srcs,
>>> +                     void ***dsts)
>>>   {
>>>        unsigned int buf_size = cfg->buf_size.cur;
>>>        unsigned int nr_sockets;
>>> @@ -317,47 +321,57 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
>>>                return -1;
>>>        }
>>>
>>> -     src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
>>> +     src_pool = rte_mempool_create("Benchmark_DMA_SRC",
>>>                        nr_buf,
>>> +                     buf_size,
>>>                        0,
>>>                        0,
>>> -                     buf_size + RTE_PKTMBUF_HEADROOM,
>>> -                     cfg->src_numa_node);
>>> +                     NULL,
>>> +                     NULL,
>>> +                     NULL,
>>> +                     NULL,
>>> +                     cfg->src_numa_node,
>>> +                     RTE_MEMPOOL_F_SP_PUT | RTE_MEMPOOL_F_SC_GET);
>>>        if (src_pool == NULL) {
>>>                PRINT_ERR("Error with source mempool creation.\n");
>>>                return -1;
>>>        }
>>>
>>> -     dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
>>> +     dst_pool = rte_mempool_create("Benchmark_DMA_DST",
>>>                        nr_buf,
>>> +                     buf_size,
>>>                        0,
>>>                        0,
>>> -                     buf_size + RTE_PKTMBUF_HEADROOM,
>>> -                     cfg->dst_numa_node);
>>> +                     NULL,
>>> +                     NULL,
>>> +                     NULL,
>>> +                     NULL,
>>> +                     cfg->dst_numa_node,
>>> +                     RTE_MEMPOOL_F_SP_PUT | RTE_MEMPOOL_F_SC_GET);
>>>        if (dst_pool == NULL) {
>>>                PRINT_ERR("Error with destination mempool creation.\n");
>>>                return -1;
>>>        }
>>>
>>> -     *srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
>>> +     *srcs = rte_malloc_socket(NULL, nr_buf * sizeof(unsigned char *), 0, cfg->src_numa_node);
>>>        if (*srcs == NULL) {
>>>                printf("Error: srcs malloc failed.\n");
>>>                return -1;
>>>        }
>>>
>>> -     *dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
>>> +     *dsts = rte_malloc_socket(NULL, nr_buf * sizeof(unsigned char *), 0, cfg->dst_numa_node);
>>>        if (*dsts == NULL) {
>>>                printf("Error: dsts malloc failed.\n");
>>>                return -1;
>>>        }
>>>
>>> -     if (rte_pktmbuf_alloc_bulk(src_pool, *srcs, nr_buf) != 0) {
>>> -             printf("alloc src mbufs failed.\n");
>>> +     if (rte_mempool_get_bulk(src_pool, *srcs, nr_buf) != 0) {
>>> +             printf("alloc src bufs failed.\n");
>>>                return -1;
>>>        }
>>>
>>> -     if (rte_pktmbuf_alloc_bulk(dst_pool, *dsts, nr_buf) != 0) {
>>> -             printf("alloc dst mbufs failed.\n");
>>> +     if (rte_mempool_get_bulk(dst_pool, *dsts, nr_buf) != 0) {
>>> +             printf("alloc dst bufs failed.\n");
>>>                return -1;
>>>        }
>>>
>>> @@ -370,7 +384,7 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>>>        uint16_t i;
>>>        uint32_t offset;
>>>        unsigned int lcore_id = 0;
>>> -     struct rte_mbuf **srcs = NULL, **dsts = NULL;
>>> +     void **srcs = NULL, **dsts = NULL;
>>>        struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
>>>        unsigned int buf_size = cfg->buf_size.cur;
>>>        uint16_t kick_batch = cfg->kick_batch.cur;
>>> @@ -478,9 +492,9 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>>>   out:
>>>        /* free mbufs used in the test */
>>>        if (srcs != NULL)
>>> -             rte_pktmbuf_free_bulk(srcs, nr_buf);
>>> +             rte_mempool_put_bulk(src_pool, srcs, nr_buf);
>>>        if (dsts != NULL)
>>> -             rte_pktmbuf_free_bulk(dsts, nr_buf);
>>> +             rte_mempool_put_bulk(dst_pool, dsts, nr_buf);
>>>
>>>        /* free the points for the mbufs */
>>>        rte_free(srcs);
>>>
> .
  
Vipin Varghese Feb. 28, 2024, 3:08 a.m. UTC | #4
Unaddressed
On 2/27/2024 5:57 PM, fengchengwen wrote:
> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>
>
> Hi Vipin,
>
> On 2024/2/27 17:57, Varghese, Vipin wrote:
>> On 2/26/2024 7:35 AM, fengchengwen wrote:
>>> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>>>
>>>
>>> Hi Vipin,
>>>
>>> On 2023/12/20 19:03, Vipin Varghese wrote:
>>>> From: Vipin Varghese <Vipin.Varghese@amd.com>
>>>>
>>>> Replace pktmbuf pool with mempool, this allows increase in MOPS
>>>> especially in lower buffer size. Using Mempool, allows to reduce
>>>> the extra CPU cycles.
>>>>
>>>> Changes made are
>>>> 1. pktmbuf pool create with mempool create.
>>>> 2. create src & dst pointer array from the appropriate NUMA node.
>>>> 3. use get pool and put for mempool objects.
>>>> 4. remove pktmbuf_mtod for dma and cpu memcpy.
>>>>
>>>> v2 changes:
>>>>    - add ACK from  Morten Brørup
>>>>
>>>> v1 changes:
>>>>    - pktmbuf pool create with mempool create.
>>>>    - create src & dst pointer array from the appropriate NUMA node.
>>>>    - use get pool and put for mempool objects.
>>>>    - remove pktmbuf_mtod for dma and cpu memcpy.
>>>>
>>>> Test Results for pktmbuf vs mempool:
>>>> ====================================
>>>>
>>>> Format: Buffer Size | % AVG cycles | % AVG Gbps
>>>>
>>>> Category-1: HW-DSA
>>>> -------------------
>>>>     64|-13.11| 14.97
>>>>    128|-41.49|  0.41
>>>>    256| -1.85|  1.20
>>>>    512| -9.38|  8.81
>>>> 1024|  1.82| -2.00
>>>> 1518|  0.00| -0.80
>>>> 2048|  1.03| -0.91
>>>> 4096|  0.00| -0.35
>>>> 8192|  0.07| -0.08
>>>>
>>>> Category-2: MEMCPY
>>>> -------------------
>>>>     64|-12.50|14.14
>>>>    128|-40.63|67.26
>>>>    256|-38.78|59.35
>>>>    512|-30.26|43.36
>>>> 1024|-21.80|27.04
>>>> 1518|-16.23|19.33
>>>> 2048|-14.75|16.81
>>>> 4096| -9.56|10.01
>>>> 8192| -3.32| 3.12
>>>>
>>>> Signed-off-by: Vipin Varghese <vipin.varghese@amd.com>
>>>> Acked-by: Morten Brørup <mb@smartsharesystems.com>
>>>> Tested-by: Thiyagrajan P <thiyagarajan.p@amd.com>
>>>> ---
>>>> ---
>>>>    app/test-dma-perf/benchmark.c | 74 +++++++++++++++++++++--------------
>>>>    1 file changed, 44 insertions(+), 30 deletions(-)
>>>>
>>>> diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
>>>> index 9b1f58c78c..dc6f16cc01 100644
>>>> --- a/app/test-dma-perf/benchmark.c
>>>> +++ b/app/test-dma-perf/benchmark.c
>>>> @@ -43,8 +43,8 @@ struct lcore_params {
>>>>         uint16_t kick_batch;
>>>>         uint32_t buf_size;
>>>>         uint16_t test_secs;
>>>> -     struct rte_mbuf **srcs;
>>>> -     struct rte_mbuf **dsts;
>>>> +     void **srcs;
>>>> +     void **dsts;
>>>>         volatile struct worker_info worker_info;
>>>>    };
>>>>
>>>> @@ -110,17 +110,17 @@ output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t r
>>>>    }
>>>>
>>>>    static inline void
>>>> -cache_flush_buf(__rte_unused struct rte_mbuf **array,
>>>> +cache_flush_buf(__rte_unused void **array,
>>>>                 __rte_unused uint32_t buf_size,
>>>>                 __rte_unused uint32_t nr_buf)
>>>>    {
>>>>    #ifdef RTE_ARCH_X86_64
>>>>         char *data;
>>>> -     struct rte_mbuf **srcs = array;
>>>> +     void **srcs = array;
>>>>         uint32_t i, offset;
>>>>
>>>>         for (i = 0; i < nr_buf; i++) {
>>>> -             data = rte_pktmbuf_mtod(srcs[i], char *);
>>>> +             data = (char *) srcs[i];
>>>>                 for (offset = 0; offset < buf_size; offset += 64)
>>>>                         __builtin_ia32_clflush(data + offset);
>>>>         }
>>>> @@ -224,8 +224,8 @@ do_dma_mem_copy(void *p)
>>>>         const uint32_t nr_buf = para->nr_buf;
>>>>         const uint16_t kick_batch = para->kick_batch;
>>>>         const uint32_t buf_size = para->buf_size;
>>>> -     struct rte_mbuf **srcs = para->srcs;
>>>> -     struct rte_mbuf **dsts = para->dsts;
>>>> +     void **srcs = para->srcs;
>>>> +     void **dsts = para->dsts;
>>>>         uint16_t nr_cpl;
>>>>         uint64_t async_cnt = 0;
>>>>         uint32_t i;
>>>> @@ -241,8 +241,12 @@ do_dma_mem_copy(void *p)
>>>>         while (1) {
>>>>                 for (i = 0; i < nr_buf; i++) {
>>>>    dma_copy:
>>>> -                     ret = rte_dma_copy(dev_id, 0, rte_mbuf_data_iova(srcs[i]),
>>>> -                             rte_mbuf_data_iova(dsts[i]), buf_size, 0);
>>>> +                     ret = rte_dma_copy(dev_id,
>>>> +                                     0,
>>>> +                                     (rte_iova_t) srcs[i],
>>>> +                                     (rte_iova_t) dsts[i],
>> Thank you ChengWen for the suggestion, please find my observations below
>>> should consider IOVA != VA, so here should be with rte_mempool_virt2iova(),
>>> but this commit is mainly to eliminate the address convert overload, so we
>>> should prepare IOVA for DMA copy, and VA for memory copy.
>> yes, Ferruh helped me to understand. Please let me look into this and share a v3 soon.
>>
>>> I prefer keep pkt_mbuf, but new add two field, and create this two field when setup_memory_env(),
>>> then direct use them in do_xxx_mem_copy。
>> Please help me understand if you are suggesting, in function `setup_memory_env` we still keep pkt_mbuf creation.
>>
>> But when the arrays are created instead of populating them with mbuf, we directly call `pktmbuf_mtod` and store the
>>
>> starting address. Thus in cpu-copy or dma-copy we do not spend time in compute. Is this what you mean?
> Yes
>
>>
>> My reasoning for not using pktmbuf is as follows
>>
>> 1. pkt_mbuf has rte_mbuf metadata + private + headroom + tailroom
>>
>> 2. so when creating payloads of 2K, 4K, 8K, 16K, 32K, 1GB we are accounting for extra headroom, which is not efficient
>>
>> 3. dma-perf is targeted for performance and not network function.
>>
>> 4. there is an existing example which makes use of pktmbuf and dma calls.
>>
>>
>> hence I would like to use mempool which also helps per numa with flags.
> What I understand is that the low performance is mainly because the CPU can't keep up with the
> DMA device's performance, so the CPU is the bottleneck; when we reduce the tasks of the CPU
> (just like this commit did), the performance improves.
>
> This commit can test the maximum performance when the CPU and DMA cowork together, so I think we can
> add this commit.
>
> pktmbuf is a popular programming entity, and almost all application (including examples) in the DPDK
> community are based on pktmbuf.
>
> I think that keeping the use of pktmbuf provides flexibility; someone who wants to do more operations with
> pktmbuf (maybe emulating the real logic) could easily modify and test it.

thank you Chengwen for the comments. I have also noticed some merges on 
`benchmark.c`. Let me take that as the new baseline and send a v3 adding 
mempool as a new option.

>
> Thanks
>
>>
>>> Thanks.
>>>
>>>> +                                     buf_size,
>>>> +                                     0);
>>>>                         if (unlikely(ret < 0)) {
>>>>                                 if (ret == -ENOSPC) {
>>>>                                         do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
>>>> @@ -276,8 +280,8 @@ do_cpu_mem_copy(void *p)
>>>>         volatile struct worker_info *worker_info = &(para->worker_info);
>>>>         const uint32_t nr_buf = para->nr_buf;
>>>>         const uint32_t buf_size = para->buf_size;
>>>> -     struct rte_mbuf **srcs = para->srcs;
>>>> -     struct rte_mbuf **dsts = para->dsts;
>>>> +     void **srcs = para->srcs;
>>>> +     void **dsts = para->dsts;
>>>>         uint32_t i;
>>>>
>>>>         worker_info->stop_flag = false;
>>>> @@ -288,8 +292,8 @@ do_cpu_mem_copy(void *p)
>>>>
>>>>         while (1) {
>>>>                 for (i = 0; i < nr_buf; i++) {
>>>> -                     const void *src = rte_pktmbuf_mtod(dsts[i], void *);
>>>> -                     void *dst = rte_pktmbuf_mtod(srcs[i], void *);
>>>> +                     const void *src = (void *) dsts[i];
>>>> +                     void *dst = (void *) srcs[i];
>>>>
>>>>                         /* copy buffer form src to dst */
>>>>                         rte_memcpy(dst, src, (size_t)buf_size);
>>>> @@ -303,8 +307,8 @@ do_cpu_mem_copy(void *p)
>>>>    }
>>>>
>>>>    static int
>>>> -setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
>>>> -                     struct rte_mbuf ***dsts)
>>>> +setup_memory_env(struct test_configure *cfg, void ***srcs,
>>>> +                     void ***dsts)
>>>>    {
>>>>         unsigned int buf_size = cfg->buf_size.cur;
>>>>         unsigned int nr_sockets;
>>>> @@ -317,47 +321,57 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
>>>>                 return -1;
>>>>         }
>>>>
>>>> -     src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
>>>> +     src_pool = rte_mempool_create("Benchmark_DMA_SRC",
>>>>                         nr_buf,
>>>> +                     buf_size,
>>>>                         0,
>>>>                         0,
>>>> -                     buf_size + RTE_PKTMBUF_HEADROOM,
>>>> -                     cfg->src_numa_node);
>>>> +                     NULL,
>>>> +                     NULL,
>>>> +                     NULL,
>>>> +                     NULL,
>>>> +                     cfg->src_numa_node,
>>>> +                     RTE_MEMPOOL_F_SP_PUT | RTE_MEMPOOL_F_SC_GET);
>>>>         if (src_pool == NULL) {
>>>>                 PRINT_ERR("Error with source mempool creation.\n");
>>>>                 return -1;
>>>>         }
>>>>
>>>> -     dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
>>>> +     dst_pool = rte_mempool_create("Benchmark_DMA_DST",
>>>>                         nr_buf,
>>>> +                     buf_size,
>>>>                         0,
>>>>                         0,
>>>> -                     buf_size + RTE_PKTMBUF_HEADROOM,
>>>> -                     cfg->dst_numa_node);
>>>> +                     NULL,
>>>> +                     NULL,
>>>> +                     NULL,
>>>> +                     NULL,
>>>> +                     cfg->dst_numa_node,
>>>> +                     RTE_MEMPOOL_F_SP_PUT | RTE_MEMPOOL_F_SC_GET);
>>>>         if (dst_pool == NULL) {
>>>>                 PRINT_ERR("Error with destination mempool creation.\n");
>>>>                 return -1;
>>>>         }
>>>>
>>>> -     *srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
>>>> +     *srcs = rte_malloc_socket(NULL, nr_buf * sizeof(unsigned char *), 0, cfg->src_numa_node);
>>>>         if (*srcs == NULL) {
>>>>                 printf("Error: srcs malloc failed.\n");
>>>>                 return -1;
>>>>         }
>>>>
>>>> -     *dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
>>>> +     *dsts = rte_malloc_socket(NULL, nr_buf * sizeof(unsigned char *), 0, cfg->dst_numa_node);
>>>>         if (*dsts == NULL) {
>>>>                 printf("Error: dsts malloc failed.\n");
>>>>                 return -1;
>>>>         }
>>>>
>>>> -     if (rte_pktmbuf_alloc_bulk(src_pool, *srcs, nr_buf) != 0) {
>>>> -             printf("alloc src mbufs failed.\n");
>>>> +     if (rte_mempool_get_bulk(src_pool, *srcs, nr_buf) != 0) {
>>>> +             printf("alloc src bufs failed.\n");
>>>>                 return -1;
>>>>         }
>>>>
>>>> -     if (rte_pktmbuf_alloc_bulk(dst_pool, *dsts, nr_buf) != 0) {
>>>> -             printf("alloc dst mbufs failed.\n");
>>>> +     if (rte_mempool_get_bulk(dst_pool, *dsts, nr_buf) != 0) {
>>>> +             printf("alloc dst bufs failed.\n");
>>>>                 return -1;
>>>>         }
>>>>
>>>> @@ -370,7 +384,7 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>>>>         uint16_t i;
>>>>         uint32_t offset;
>>>>         unsigned int lcore_id = 0;
>>>> -     struct rte_mbuf **srcs = NULL, **dsts = NULL;
>>>> +     void **srcs = NULL, **dsts = NULL;
>>>>         struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
>>>>         unsigned int buf_size = cfg->buf_size.cur;
>>>>         uint16_t kick_batch = cfg->kick_batch.cur;
>>>> @@ -478,9 +492,9 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>>>>    out:
>>>>         /* free mbufs used in the test */
>>>>         if (srcs != NULL)
>>>> -             rte_pktmbuf_free_bulk(srcs, nr_buf);
>>>> +             rte_mempool_put_bulk(src_pool, srcs, nr_buf);
>>>>         if (dsts != NULL)
>>>> -             rte_pktmbuf_free_bulk(dsts, nr_buf);
>>>> +             rte_mempool_put_bulk(dst_pool, dsts, nr_buf);
>>>>
>>>>         /* free the points for the mbufs */
>>>>         rte_free(srcs);
>>>>
>> .
  

Patch

diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 9b1f58c78c..dc6f16cc01 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -43,8 +43,8 @@  struct lcore_params {
 	uint16_t kick_batch;
 	uint32_t buf_size;
 	uint16_t test_secs;
-	struct rte_mbuf **srcs;
-	struct rte_mbuf **dsts;
+	void **srcs;
+	void **dsts;
 	volatile struct worker_info worker_info;
 };
 
@@ -110,17 +110,17 @@  output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t r
 }
 
 static inline void
-cache_flush_buf(__rte_unused struct rte_mbuf **array,
+cache_flush_buf(__rte_unused void **array,
 		__rte_unused uint32_t buf_size,
 		__rte_unused uint32_t nr_buf)
 {
 #ifdef RTE_ARCH_X86_64
 	char *data;
-	struct rte_mbuf **srcs = array;
+	void **srcs = array;
 	uint32_t i, offset;
 
 	for (i = 0; i < nr_buf; i++) {
-		data = rte_pktmbuf_mtod(srcs[i], char *);
+		data = (char *) srcs[i];
 		for (offset = 0; offset < buf_size; offset += 64)
 			__builtin_ia32_clflush(data + offset);
 	}
@@ -224,8 +224,8 @@  do_dma_mem_copy(void *p)
 	const uint32_t nr_buf = para->nr_buf;
 	const uint16_t kick_batch = para->kick_batch;
 	const uint32_t buf_size = para->buf_size;
-	struct rte_mbuf **srcs = para->srcs;
-	struct rte_mbuf **dsts = para->dsts;
+	void **srcs = para->srcs;
+	void **dsts = para->dsts;
 	uint16_t nr_cpl;
 	uint64_t async_cnt = 0;
 	uint32_t i;
@@ -241,8 +241,12 @@  do_dma_mem_copy(void *p)
 	while (1) {
 		for (i = 0; i < nr_buf; i++) {
 dma_copy:
-			ret = rte_dma_copy(dev_id, 0, rte_mbuf_data_iova(srcs[i]),
-				rte_mbuf_data_iova(dsts[i]), buf_size, 0);
+			ret = rte_dma_copy(dev_id,
+					0,
+					(rte_iova_t) srcs[i],
+					(rte_iova_t) dsts[i],
+					buf_size,
+					0);
 			if (unlikely(ret < 0)) {
 				if (ret == -ENOSPC) {
 					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
@@ -276,8 +280,8 @@  do_cpu_mem_copy(void *p)
 	volatile struct worker_info *worker_info = &(para->worker_info);
 	const uint32_t nr_buf = para->nr_buf;
 	const uint32_t buf_size = para->buf_size;
-	struct rte_mbuf **srcs = para->srcs;
-	struct rte_mbuf **dsts = para->dsts;
+	void **srcs = para->srcs;
+	void **dsts = para->dsts;
 	uint32_t i;
 
 	worker_info->stop_flag = false;
@@ -288,8 +292,8 @@  do_cpu_mem_copy(void *p)
 
 	while (1) {
 		for (i = 0; i < nr_buf; i++) {
-			const void *src = rte_pktmbuf_mtod(dsts[i], void *);
-			void *dst = rte_pktmbuf_mtod(srcs[i], void *);
+			const void *src = (void *) dsts[i];
+			void *dst = (void *) srcs[i];
 
 			/* copy buffer form src to dst */
 			rte_memcpy(dst, src, (size_t)buf_size);
@@ -303,8 +307,8 @@  do_cpu_mem_copy(void *p)
 }
 
 static int
-setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
-			struct rte_mbuf ***dsts)
+setup_memory_env(struct test_configure *cfg, void ***srcs,
+			void ***dsts)
 {
 	unsigned int buf_size = cfg->buf_size.cur;
 	unsigned int nr_sockets;
@@ -317,47 +321,57 @@  setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		return -1;
 	}
 
-	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
+	src_pool = rte_mempool_create("Benchmark_DMA_SRC",
 			nr_buf,
+			buf_size,
 			0,
 			0,
-			buf_size + RTE_PKTMBUF_HEADROOM,
-			cfg->src_numa_node);
+			NULL,
+			NULL,
+			NULL,
+			NULL,
+			cfg->src_numa_node,
+			RTE_MEMPOOL_F_SP_PUT | RTE_MEMPOOL_F_SC_GET);
 	if (src_pool == NULL) {
 		PRINT_ERR("Error with source mempool creation.\n");
 		return -1;
 	}
 
-	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
+	dst_pool = rte_mempool_create("Benchmark_DMA_DST",
 			nr_buf,
+			buf_size,
 			0,
 			0,
-			buf_size + RTE_PKTMBUF_HEADROOM,
-			cfg->dst_numa_node);
+			NULL,
+			NULL,
+			NULL,
+			NULL,
+			cfg->dst_numa_node,
+			RTE_MEMPOOL_F_SP_PUT | RTE_MEMPOOL_F_SC_GET);
 	if (dst_pool == NULL) {
 		PRINT_ERR("Error with destination mempool creation.\n");
 		return -1;
 	}
 
-	*srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	*srcs = rte_malloc_socket(NULL, nr_buf * sizeof(unsigned char *), 0, cfg->src_numa_node);
 	if (*srcs == NULL) {
 		printf("Error: srcs malloc failed.\n");
 		return -1;
 	}
 
-	*dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	*dsts = rte_malloc_socket(NULL, nr_buf * sizeof(unsigned char *), 0, cfg->dst_numa_node);
 	if (*dsts == NULL) {
 		printf("Error: dsts malloc failed.\n");
 		return -1;
 	}
 
-	if (rte_pktmbuf_alloc_bulk(src_pool, *srcs, nr_buf) != 0) {
-		printf("alloc src mbufs failed.\n");
+	if (rte_mempool_get_bulk(src_pool, *srcs, nr_buf) != 0) {
+		printf("alloc src bufs failed.\n");
 		return -1;
 	}
 
-	if (rte_pktmbuf_alloc_bulk(dst_pool, *dsts, nr_buf) != 0) {
-		printf("alloc dst mbufs failed.\n");
+	if (rte_mempool_get_bulk(dst_pool, *dsts, nr_buf) != 0) {
+		printf("alloc dst bufs failed.\n");
 		return -1;
 	}
 
@@ -370,7 +384,7 @@  mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	uint16_t i;
 	uint32_t offset;
 	unsigned int lcore_id = 0;
-	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	void **srcs = NULL, **dsts = NULL;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
 	unsigned int buf_size = cfg->buf_size.cur;
 	uint16_t kick_batch = cfg->kick_batch.cur;
@@ -478,9 +492,9 @@  mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 out:
 	/* free mbufs used in the test */
 	if (srcs != NULL)
-		rte_pktmbuf_free_bulk(srcs, nr_buf);
+		rte_mempool_put_bulk(src_pool, srcs, nr_buf);
 	if (dsts != NULL)
-		rte_pktmbuf_free_bulk(dsts, nr_buf);
+		rte_mempool_put_bulk(dst_pool, dsts, nr_buf);
 
 	/* free the points for the mbufs */
 	rte_free(srcs);