[v5] app/dma-perf: introduce dma-perf application

Message ID 20230608084355.16209-1-cheng1.jiang@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series [v5] app/dma-perf: introduce dma-perf application |

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/loongarch-compilation success Compilation OK
ci/loongarch-unit-testing success Unit Testing PASS
ci/github-robot: build success github build: passed
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-testing success Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-broadcom-Functional success Functional Testing PASS
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-aarch64-compile-testing success Testing PASS
ci/iol-x86_64-compile-testing success Testing PASS
ci/iol-aarch-unit-testing success Testing PASS
ci/iol-x86_64-unit-testing success Testing PASS
ci/iol-unit-testing success Testing PASS
ci/iol-abi-testing success Testing PASS
ci/Intel-compilation success Compilation OK
ci/intel-Functional success Functional PASS
ci/intel-Testing success Testing PASS

Commit Message

Jiang, Cheng1 June 8, 2023, 8:43 a.m. UTC
  There are many high-performance DMA devices supported in DPDK now, and
these DMA devices can also be integrated into other modules of DPDK as
accelerators, such as Vhost. Before integrating DMA into applications,
developers need to know the performance of these DMA devices in various
scenarios and the performance of CPUs in the same scenario, such as
different buffer lengths. Only in this way can we know the target
performance of the application accelerated by using them. This patch
introduces a high-performance testing tool, which supports comparing the
performance of CPU and DMA in different scenarios automatically with a
pre-set config file. Memory Copy performance test are supported for now.

Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
Acked-by: Chenbo Xia <chenbo.xia@intel.com>
---
v5:
  fixed some LONG_LINE warnings;
v4:
  fixed inaccuracy of the memory footprint display;
v3:
  fixed some typos;
v2:
  added lcore/dmadev designation;
  added error case process;
  removed worker_threads parameter from config.ini;
  improved the logs;
  improved config file;

 app/meson.build               |   1 +
 app/test-dma-perf/benchmark.c | 472 ++++++++++++++++++++++++++++
 app/test-dma-perf/config.ini  |  59 ++++
 app/test-dma-perf/main.c      | 569 ++++++++++++++++++++++++++++++++++
 app/test-dma-perf/main.h      |  69 +++++
 app/test-dma-perf/meson.build |  17 +
 6 files changed, 1187 insertions(+)
 create mode 100644 app/test-dma-perf/benchmark.c
 create mode 100644 app/test-dma-perf/config.ini
 create mode 100644 app/test-dma-perf/main.c
 create mode 100644 app/test-dma-perf/main.h
 create mode 100644 app/test-dma-perf/meson.build

--
2.40.1
  

Comments

Anoob Joseph June 9, 2023, 11:44 a.m. UTC | #1
Hi,

Thanks for adding the app. Few comments inline. Please check.

Thanks,
Anoob

> -----Original Message-----
> From: Cheng Jiang <cheng1.jiang@intel.com>
> Sent: Thursday, June 8, 2023 2:14 PM
> To: thomas@monjalon.net; bruce.richardson@intel.com;
> mb@smartsharesystems.com; chenbo.xia@intel.com
> Cc: dev@dpdk.org; jiayu.hu@intel.com; xuan.ding@intel.com;
> wenwux.ma@intel.com; yuanx.wang@intel.com; xingguang.he@intel.com;
> Cheng Jiang <cheng1.jiang@intel.com>
> Subject: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf application
> 
> External Email
> 
> ----------------------------------------------------------------------
> There are many high-performance DMA devices supported in DPDK now,
> and these DMA devices can also be integrated into other modules of DPDK as
> accelerators, such as Vhost. Before integrating DMA into applications,
> developers need to know the performance of these DMA devices in various
> scenarios and the performance of CPUs in the same scenario, such as
> different buffer lengths. Only in this way can we know the target
> performance of the application accelerated by using them. This patch
> introduces a high-performance testing tool, which supports comparing the
> performance of CPU and DMA in different scenarios automatically with a pre-
> set config file. Memory Copy performance test are supported for now.
> 
> Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> ---
> v5:
>   fixed some LONG_LINE warnings;
> v4:
>   fixed inaccuracy of the memory footprint display;
> v3:
>   fixed some typos;
> v2:
>   added lcore/dmadev designation;
>   added error case process;
>   removed worker_threads parameter from config.ini;
>   improved the logs;
>   improved config file;
> 
>  app/meson.build               |   1 +
>  app/test-dma-perf/benchmark.c | 472 ++++++++++++++++++++++++++++
> app/test-dma-perf/config.ini  |  59 ++++
>  app/test-dma-perf/main.c      | 569
> ++++++++++++++++++++++++++++++++++
>  app/test-dma-perf/main.h      |  69 +++++
>  app/test-dma-perf/meson.build |  17 +
>  6 files changed, 1187 insertions(+)
>  create mode 100644 app/test-dma-perf/benchmark.c  create mode 100644
> app/test-dma-perf/config.ini  create mode 100644 app/test-dma-
> perf/main.c  create mode 100644 app/test-dma-perf/main.h  create mode
> 100644 app/test-dma-perf/meson.build
> 

<snip>

> +
> +/* Configuration of device. */
> +static void
> +configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size) {
> +	uint16_t vchan = 0;
> +	struct rte_dma_info info;
> +	struct rte_dma_conf dev_config = { .nb_vchans = 1 };

[Anoob] Is it possible to use more vchans? The code launches as many threads as the number of dma devices. Instead it should be total number of vchans.

> +	struct rte_dma_vchan_conf qconf = {
> +		.direction = RTE_DMA_DIR_MEM_TO_MEM,
> +		.nb_desc = ring_size
> +	};
> +
> +	if (rte_dma_configure(dev_id, &dev_config) != 0)
> +		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
> +
> +	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
> +		rte_exit(EXIT_FAILURE, "Error with queue configuration.\n");
> +
> +	rte_dma_info_get(dev_id, &info);
> +	if (info.nb_vchans != 1)
> +		rte_exit(EXIT_FAILURE, "Error, no configured queues
> reported on device id. %u\n",
> +				dev_id);
> +
> +	if (rte_dma_start(dev_id) != 0)
> +		rte_exit(EXIT_FAILURE, "Error with dma start.\n"); }
> +
> +

<snip>

> +static inline int
> +do_dma_mem_copy(void *p)
> +{
> +	uint16_t *para_idx = (uint16_t *)p;
> +	volatile struct lcore_params *para = worker_params[*para_idx];
> +	volatile struct worker_info *worker_info = &(para->worker_info);
> +	uint16_t dev_id = para->dev_id;
> +	uint32_t nr_buf = para->nr_buf;
> +	uint16_t kick_batch = para->kick_batch;

[Anoob] Some of these variables can be made const. Since this is fast path, might be beneficial doing that way.

> +	uint32_t buf_size = para->buf_size;
> +	struct rte_mbuf **srcs = para->srcs;
> +	struct rte_mbuf **dsts = para->dsts;
> +	int64_t async_cnt = 0;
> +	int nr_cpl = 0;
> +	uint32_t i;
> +	uint32_t poll_cnt = 0;
> +
> +	worker_info->stop_flag = false;
> +	worker_info->ready_flag = true;
> +
> +	while (!worker_info->start_flag)
> +		;
> +
> +	while (1) {
> +		for (i = 0; i < nr_buf; i++) {
> +			if (unlikely(rte_dma_copy(dev_id,
> +						0,
> +						rte_pktmbuf_iova(srcs[i]),
> +						rte_pktmbuf_iova(dsts[i]),
> +						buf_size,
> +						0) < 0)) {
> +				rte_dma_submit(dev_id, 0);
> +				while (rte_dma_burst_capacity(dev_id, 0) ==
> 0) {
> +					nr_cpl = rte_dma_completed(dev_id,
> 0, MAX_DMA_CPL_NB,
> +								NULL, NULL);
> +					async_cnt -= nr_cpl;
> +					worker_info->total_cpl += nr_cpl;
> +				}
> +				if (rte_dma_copy(dev_id,
> +						0,
> +						rte_pktmbuf_iova(srcs[i]),
> +						rte_pktmbuf_iova(dsts[i]),
> +						buf_size,
> +						0) < 0) {
> +					printf("enqueue fail again at %u\n",
> i);
> +					printf("space:%d\n",
> rte_dma_burst_capacity(dev_id, 0));
> +					rte_exit(EXIT_FAILURE, "DMA
> enqueue failed\n");
> +				}

[Anoob] Only if the API returns -ENOSPC we should retry submitting, right? Other errors should be treated as fatal errors.

Do we need to use rte_dma_burst_capacity() API?

Can't we try something like,

dma_copy:
		ret =  rte_dma_copy(dev_id, 0, rte_pktmbuf_iova(srcs[i]), rte_pktmbuf_iova(dsts[i]), buf_size, 0);
		if (unlikely (ret < 0) {
			if (ret == -ENOSPC) {
				rte_dma_submit(dev_id, 0);
				/* DMA completed & other handling */
				goto dma_copy;
			} else {
				/* Error exit */
			}
		}

			
> +			}
> +			async_cnt++;
> +
> +			if ((async_cnt % kick_batch) == 0) {
> +				rte_dma_submit(dev_id, 0);
> +				/* add a poll to avoid ring full */
> +				nr_cpl = rte_dma_completed(dev_id, 0,
> MAX_DMA_CPL_NB, NULL, NULL);
> +				async_cnt -= nr_cpl;
> +				worker_info->total_cpl += nr_cpl;

[Anoob] Above code can be made as a static inline function so that in cases rte_dma_copy returns -ENOSPC, same static inline can be called.

<snip>
  
Amit Prakash Shukla June 9, 2023, 2:03 p.m. UTC | #2
> -----Original Message-----
> From: Cheng Jiang <cheng1.jiang@intel.com>
> Sent: Thursday, June 8, 2023 2:14 PM
> To: thomas@monjalon.net; bruce.richardson@intel.com;
> mb@smartsharesystems.com; chenbo.xia@intel.com
> Cc: dev@dpdk.org; jiayu.hu@intel.com; xuan.ding@intel.com;
> wenwux.ma@intel.com; yuanx.wang@intel.com; xingguang.he@intel.com;
> Cheng Jiang <cheng1.jiang@intel.com>
> Subject: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf application
> 
> External Email
> 
> ----------------------------------------------------------------------
> There are many high-performance DMA devices supported in DPDK now,
> and these DMA devices can also be integrated into other modules of DPDK as
> accelerators, such as Vhost. Before integrating DMA into applications,
> developers need to know the performance of these DMA devices in various
> scenarios and the performance of CPUs in the same scenario, such as
> different buffer lengths. Only in this way can we know the target
> performance of the application accelerated by using them. This patch
> introduces a high-performance testing tool, which supports comparing the
> performance of CPU and DMA in different scenarios automatically with a pre-
> set config file. Memory Copy performance test are supported for now.
> 
> Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> ---
> v5:
>   fixed some LONG_LINE warnings;
> v4:
>   fixed inaccuracy of the memory footprint display;
> v3:
>   fixed some typos;
> v2:
>   added lcore/dmadev designation;
>   added error case process;
>   removed worker_threads parameter from config.ini;
>   improved the logs;
>   improved config file;
> 
>  app/meson.build               |   1 +
>  app/test-dma-perf/benchmark.c | 472 ++++++++++++++++++++++++++++
> app/test-dma-perf/config.ini  |  59 ++++
>  app/test-dma-perf/main.c      | 569
> ++++++++++++++++++++++++++++++++++
>  app/test-dma-perf/main.h      |  69 +++++
>  app/test-dma-perf/meson.build |  17 +
>  6 files changed, 1187 insertions(+)
>  create mode 100644 app/test-dma-perf/benchmark.c  create mode 100644
> app/test-dma-perf/config.ini  create mode 100644 app/test-dma-
> perf/main.c  create mode 100644 app/test-dma-perf/main.h  create mode
> 100644 app/test-dma-perf/meson.build
> 

<snip>

> +
> +static inline int
> +do_dma_mem_copy(void *p)
> +{
> +	uint16_t *para_idx = (uint16_t *)p;
> +	volatile struct lcore_params *para = worker_params[*para_idx];
> +	volatile struct worker_info *worker_info = &(para->worker_info);
> +	uint16_t dev_id = para->dev_id;
> +	uint32_t nr_buf = para->nr_buf;
> +	uint16_t kick_batch = para->kick_batch;
> +	uint32_t buf_size = para->buf_size;
> +	struct rte_mbuf **srcs = para->srcs;
> +	struct rte_mbuf **dsts = para->dsts;
> +	int64_t async_cnt = 0;
> +	int nr_cpl = 0;
> +	uint32_t i;
> +	uint32_t poll_cnt = 0;
> +
> +	worker_info->stop_flag = false;
> +	worker_info->ready_flag = true;
> +
> +	while (!worker_info->start_flag)
> +		;
> +
> +	while (1) {
> +		for (i = 0; i < nr_buf; i++) {
> +			if (unlikely(rte_dma_copy(dev_id,
> +						0,
> +						rte_pktmbuf_iova(srcs[i]),
> +						rte_pktmbuf_iova(dsts[i]),
> +						buf_size,
> +						0) < 0)) {
> +				rte_dma_submit(dev_id, 0);
> +				while (rte_dma_burst_capacity(dev_id, 0) ==
> 0) {
> +					nr_cpl = rte_dma_completed(dev_id,
> 0, MAX_DMA_CPL_NB,
> +								NULL, NULL);
> +					async_cnt -= nr_cpl;
> +					worker_info->total_cpl += nr_cpl;
> +				}
> +				if (rte_dma_copy(dev_id,
> +						0,
> +						rte_pktmbuf_iova(srcs[i]),
> +						rte_pktmbuf_iova(dsts[i]),
> +						buf_size,
> +						0) < 0) {
> +					printf("enqueue fail again at %u\n",
> i);
> +					printf("space:%d\n",
> rte_dma_burst_capacity(dev_id, 0));
> +					rte_exit(EXIT_FAILURE, "DMA
> enqueue failed\n");

[Amit]: On all success or failure exits, please call rte_dma_stop and rte_dma_close to exit cleanly.

> +				}
> +			}
> +			async_cnt++;
> +
> +			if ((async_cnt % kick_batch) == 0) {
> +				rte_dma_submit(dev_id, 0);
> +				/* add a poll to avoid ring full */
> +				nr_cpl = rte_dma_completed(dev_id, 0,
> MAX_DMA_CPL_NB, NULL, NULL);
> +				async_cnt -= nr_cpl;
> +				worker_info->total_cpl += nr_cpl;
> +			}
> +		}
> +
> +		if (worker_info->stop_flag)
> +			break;
> +	}
> +
> +	rte_dma_submit(dev_id, 0);
> +	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
> +		nr_cpl = rte_dma_completed(dev_id, 0,
> MAX_DMA_CPL_NB, NULL, NULL);
> +		async_cnt -= nr_cpl;
> +	}
> +
> +	return 0;
> +}
> +

<snip>

> +
> +void
> +mem_copy_benchmark(struct test_configure *cfg, bool is_dma) {
> +	uint16_t i;
> +	uint32_t offset;
> +	unsigned int lcore_id = 0;
> +	struct rte_mbuf **srcs = NULL, **dsts = NULL;
> +	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> +	unsigned int buf_size = cfg->buf_size.cur;
> +	uint16_t kick_batch = cfg->kick_batch.cur;
> +	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) /
> (cfg->buf_size.cur * 2);
> +	uint16_t nb_workers = ldm->cnt;
> +	uint16_t test_secs = cfg->test_secs;
> +	float memory;
> +	uint32_t avg_cycles = 0;
> +	float mops;
> +	float bandwidth;
> +
> +	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> +		goto out;
> +
> +	if (is_dma)
> +		if (config_dmadevs(cfg) < 0)
> +			goto out;
> +
> +	if (cfg->cache_flush) {
> +		cache_flush_buf(srcs, buf_size, nr_buf);
> +		cache_flush_buf(dsts, buf_size, nr_buf);
> +		rte_mb();
> +	}
> +
> +	printf("Start testing....\n");
> +
> +	for (i = 0; i < nb_workers; i++) {
> +		lcore_id = ldm->lcores[i];
> +		offset = nr_buf / nb_workers * i;
> +
> +		worker_params[i] = rte_malloc(NULL, sizeof(struct
> lcore_params), 0);
> +		if (!worker_params[i]) {
> +			printf("lcore parameters malloc failure for lcore
> %d\n", lcore_id);
> +			break;
> +		}
> +		if (is_dma) {
> +			worker_params[i]->dma_name = ldm-
> >dma_names[i];
> +			worker_params[i]->dev_id = ldm->dma_ids[i];
> +			worker_params[i]->kick_batch = kick_batch;
> +		}
> +		worker_params[i]->worker_id = i;
> +		worker_params[i]->nr_buf = (uint32_t)(nr_buf /
> nb_workers);
> +		worker_params[i]->buf_size = buf_size;
> +		worker_params[i]->test_secs = test_secs;
> +		worker_params[i]->srcs = srcs + offset;
> +		worker_params[i]->dsts = dsts + offset;
> +		worker_params[i]->scenario_id = cfg->scenario_id;
> +		worker_params[i]->lcore_id = lcore_id;
> +
> +		if (is_dma)
> +			rte_eal_remote_launch(do_dma_mem_copy, (void
> *)(&i), lcore_id);
> +		else
> +			rte_eal_remote_launch(do_cpu_mem_copy, (void
> *)(&i), lcore_id);
> +	}
> +
> +	while (1) {
> +		bool ready = true;
> +		for (i = 0; i < nb_workers; i++) {
> +			if (worker_params[i]->worker_info.ready_flag ==
> false) {
> +				ready = 0;
> +				break;
> +			}
> +		}
> +		if (ready)
> +			break;
> +	}
> +
> +	for (i = 0; i < nb_workers; i++)
> +		worker_params[i]->worker_info.start_flag = true;
> +
> +	usleep(TEST_WAIT_U_SECOND);
> +	for (i = 0; i < nb_workers; i++)
> +		worker_params[i]->worker_info.test_cpl =
> +worker_params[i]->worker_info.total_cpl;
> +
> +	usleep(test_secs * 1000 * 1000);
> +	for (i = 0; i < nb_workers; i++)
> +		worker_params[i]->worker_info.test_cpl =
> worker_params[i]->worker_info.total_cpl -
> +						worker_params[i]-
> >worker_info.test_cpl;
> +
> +	for (i = 0; i < nb_workers; i++)
> +		worker_params[i]->worker_info.stop_flag = true;
> +
> +	rte_eal_mp_wait_lcore();
> +
> +	for (i = 0; i < nb_workers; i++) {
> +		calc_result(buf_size, nr_buf, nb_workers, test_secs,
> +			worker_params[i]->worker_info.test_cpl,
> +			&memory, &avg_cycles, &bandwidth, &mops);
> +		output_result(cfg->scenario_id, worker_params[i]->lcore_id,
> +					worker_params[i]->dma_name,
> avg_cycles, buf_size,
> +					nr_buf / nb_workers, memory,
> bandwidth, mops, is_dma);
> +	}
> +
> +out:
> +	/* free env */
> +	if (srcs)
> +		rte_pktmbuf_free_bulk(srcs, nr_buf);
> +	if (dsts)
> +		rte_pktmbuf_free_bulk(dsts, nr_buf);
> +
> +	if (src_pool)
> +		rte_mempool_free(src_pool);
> +	if (dst_pool)
> +		rte_mempool_free(dst_pool);
> +
> +	if (is_dma) {
> +		for (i = 0; i < nb_workers; i++) {
> +			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
> +			rte_dma_stop(ldm->dma_ids[i]);

[Amit]: Below rte_dma_stop please call rte_dma_close for clean exit.

<snip>

> +#endif /* _MAIN_H_ */
> diff --git a/app/test-dma-perf/meson.build b/app/test-dma-
> perf/meson.build new file mode 100644 index 0000000000..bd6c264002
> --- /dev/null
> +++ b/app/test-dma-perf/meson.build
> @@ -0,0 +1,17 @@
> +# SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2019-2023 Intel
> +Corporation
> +
> +# meson file, for building this app as part of a main DPDK build.
> +
> +if is_windows
> +    build = false
> +    reason = 'not supported on Windows'
> +    subdir_done()
> +endif
> +
> +deps += ['dmadev', 'mbuf', 'cfgfile']
> +
> +sources = files(
> +        'main.c',
> +        'benchmark.c',
> +)
> --
> 2.40.1
  
Jiang, Cheng1 June 12, 2023, 7:40 a.m. UTC | #3
Hi,

Thanks for your comments, the replies are inline.

Thanks,
Cheng

> -----Original Message-----
> From: Anoob Joseph <anoobj@marvell.com>
> Sent: Friday, June 9, 2023 7:44 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>; thomas@monjalon.net;
> Richardson, Bruce <bruce.richardson@intel.com>;
> mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang,
> YuanX <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>;
> Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Vamsi Krishna Attunuru
> <vattunuru@marvell.com>; Amit Prakash Shukla
> <amitprakashs@marvell.com>; Satha Koteswara Rao Kottidi
> <skoteshwar@marvell.com>; Gowrishankar Muthukrishnan
> <gmuthukrishn@marvell.com>; Vidya Sagar Velumuri
> <vvelumuri@marvell.com>
> Subject: RE: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf application
> 
> Hi,
> 
> Thanks for adding the app. Few comments inline. Please check.
> 
> Thanks,
> Anoob
> 
> > -----Original Message-----
> > From: Cheng Jiang <cheng1.jiang@intel.com>
> > Sent: Thursday, June 8, 2023 2:14 PM
> > To: thomas@monjalon.net; bruce.richardson@intel.com;
> > mb@smartsharesystems.com; chenbo.xia@intel.com
> > Cc: dev@dpdk.org; jiayu.hu@intel.com; xuan.ding@intel.com;
> > wenwux.ma@intel.com; yuanx.wang@intel.com; xingguang.he@intel.com;
> > Cheng Jiang <cheng1.jiang@intel.com>
> > Subject: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf application
> >
> > External Email
> >
> > ----------------------------------------------------------------------
> > There are many high-performance DMA devices supported in DPDK now,
> and
> > these DMA devices can also be integrated into other modules of DPDK as
> > accelerators, such as Vhost. Before integrating DMA into applications,
> > developers need to know the performance of these DMA devices in
> > various scenarios and the performance of CPUs in the same scenario,
> > such as different buffer lengths. Only in this way can we know the
> > target performance of the application accelerated by using them. This
> > patch introduces a high-performance testing tool, which supports
> > comparing the performance of CPU and DMA in different scenarios
> > automatically with a pre- set config file. Memory Copy performance test
> are supported for now.
> >
> > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> > ---
> > v5:
> >   fixed some LONG_LINE warnings;
> > v4:
> >   fixed inaccuracy of the memory footprint display;
> > v3:
> >   fixed some typos;
> > v2:
> >   added lcore/dmadev designation;
> >   added error case process;
> >   removed worker_threads parameter from config.ini;
> >   improved the logs;
> >   improved config file;
> >
> >  app/meson.build               |   1 +
> >  app/test-dma-perf/benchmark.c | 472 ++++++++++++++++++++++++++++
> > app/test-dma-perf/config.ini  |  59 ++++
> >  app/test-dma-perf/main.c      | 569
> > ++++++++++++++++++++++++++++++++++
> >  app/test-dma-perf/main.h      |  69 +++++
> >  app/test-dma-perf/meson.build |  17 +
> >  6 files changed, 1187 insertions(+)
> >  create mode 100644 app/test-dma-perf/benchmark.c  create mode
> 100644
> > app/test-dma-perf/config.ini  create mode 100644 app/test-dma-
> > perf/main.c  create mode 100644 app/test-dma-perf/main.h  create mode
> > 100644 app/test-dma-perf/meson.build
> >
> 
> <snip>
> 
> > +
> > +/* Configuration of device. */
> > +static void
> > +configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size) {
> > +	uint16_t vchan = 0;
> > +	struct rte_dma_info info;
> > +	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
> 
> [Anoob] Is it possible to use more vchans? The code launches as many
> threads as the number of dma devices. Instead it should be total number of
> vchans.

[Cheng] Really good suggestion. This is feasible, but in the initial stage, we want to keep things simple. Perhaps in the future, we can add a parameter to configure the number of vchans for each device and then launch the corresponding number of threads for each vchan.

> 
> > +	struct rte_dma_vchan_conf qconf = {
> > +		.direction = RTE_DMA_DIR_MEM_TO_MEM,
> > +		.nb_desc = ring_size
> > +	};
> > +
> > +	if (rte_dma_configure(dev_id, &dev_config) != 0)
> > +		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
> > +
> > +	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
> > +		rte_exit(EXIT_FAILURE, "Error with queue configuration.\n");
> > +
> > +	rte_dma_info_get(dev_id, &info);
> > +	if (info.nb_vchans != 1)
> > +		rte_exit(EXIT_FAILURE, "Error, no configured queues
> > reported on device id. %u\n",
> > +				dev_id);
> > +
> > +	if (rte_dma_start(dev_id) != 0)
> > +		rte_exit(EXIT_FAILURE, "Error with dma start.\n"); }
> > +
> > +
> 
> <snip>
> 
> > +static inline int
> > +do_dma_mem_copy(void *p)
> > +{
> > +	uint16_t *para_idx = (uint16_t *)p;
> > +	volatile struct lcore_params *para = worker_params[*para_idx];
> > +	volatile struct worker_info *worker_info = &(para->worker_info);
> > +	uint16_t dev_id = para->dev_id;
> > +	uint32_t nr_buf = para->nr_buf;
> > +	uint16_t kick_batch = para->kick_batch;
> 
> [Anoob] Some of these variables can be made const. Since this is fast path,
> might be beneficial doing that way.

[Cheng] Good idea, I'll improve it in the next version.

> 
> > +	uint32_t buf_size = para->buf_size;
> > +	struct rte_mbuf **srcs = para->srcs;
> > +	struct rte_mbuf **dsts = para->dsts;
> > +	int64_t async_cnt = 0;
> > +	int nr_cpl = 0;
> > +	uint32_t i;
> > +	uint32_t poll_cnt = 0;
> > +
> > +	worker_info->stop_flag = false;
> > +	worker_info->ready_flag = true;
> > +
> > +	while (!worker_info->start_flag)
> > +		;
> > +
> > +	while (1) {
> > +		for (i = 0; i < nr_buf; i++) {
> > +			if (unlikely(rte_dma_copy(dev_id,
> > +						0,
> > +						rte_pktmbuf_iova(srcs[i]),
> > +						rte_pktmbuf_iova(dsts[i]),
> > +						buf_size,
> > +						0) < 0)) {
> > +				rte_dma_submit(dev_id, 0);
> > +				while (rte_dma_burst_capacity(dev_id, 0) ==
> > 0) {
> > +					nr_cpl = rte_dma_completed(dev_id,
> > 0, MAX_DMA_CPL_NB,
> > +								NULL, NULL);
> > +					async_cnt -= nr_cpl;
> > +					worker_info->total_cpl += nr_cpl;
> > +				}
> > +				if (rte_dma_copy(dev_id,
> > +						0,
> > +						rte_pktmbuf_iova(srcs[i]),
> > +						rte_pktmbuf_iova(dsts[i]),
> > +						buf_size,
> > +						0) < 0) {
> > +					printf("enqueue fail again at %u\n",
> > i);
> > +					printf("space:%d\n",
> > rte_dma_burst_capacity(dev_id, 0));
> > +					rte_exit(EXIT_FAILURE, "DMA
> > enqueue failed\n");
> > +				}
> 
> [Anoob] Only if the API returns -ENOSPC we should retry submitting, right?
> Other errors should be treated as fatal errors.
> 
> Do we need to use rte_dma_burst_capacity() API?
> 
> Can't we try something like,
> 
> dma_copy:
> 		ret =  rte_dma_copy(dev_id, 0, rte_pktmbuf_iova(srcs[i]),
> rte_pktmbuf_iova(dsts[i]), buf_size, 0);
> 		if (unlikely (ret < 0) {
> 			if (ret == -ENOSPC) {
> 				rte_dma_submit(dev_id, 0);
> 				/* DMA completed & other handling */
> 				goto dma_copy;
> 			} else {
> 				/* Error exit */
> 			}
> 		}
> 
> 

[Cheng] Good idea, we don't have to check the capacity explicitly. I think your implementation is more clear, thanks. I will fix it in the next version.

> > +			}
> > +			async_cnt++;
> > +
> > +			if ((async_cnt % kick_batch) == 0) {
> > +				rte_dma_submit(dev_id, 0);
> > +				/* add a poll to avoid ring full */
> > +				nr_cpl = rte_dma_completed(dev_id, 0,
> > MAX_DMA_CPL_NB, NULL, NULL);
> > +				async_cnt -= nr_cpl;
> > +				worker_info->total_cpl += nr_cpl;
> 
> [Anoob] Above code can be made as a static inline function so that in cases
> rte_dma_copy returns -ENOSPC, same static inline can be called.
> 

[Cheng] sure, got it. Thanks!

> <snip>
  
Jiang, Cheng1 June 12, 2023, 8:26 a.m. UTC | #4
Hi,

Thanks for your comments, replies are inline.

Thanks,
Cheng

> -----Original Message-----
> From: Amit Prakash Shukla <amitprakashs@marvell.com>
> Sent: Friday, June 9, 2023 10:03 PM
> To: Jiang, Cheng1 <cheng1.jiang@intel.com>; thomas@monjalon.net;
> Richardson, Bruce <bruce.richardson@intel.com>;
> mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang,
> YuanX <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>;
> Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Anoob Joseph
> <anoobj@marvell.com>
> Subject: RE: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf application
> 
> 
> 
> > -----Original Message-----
> > From: Cheng Jiang <cheng1.jiang@intel.com>
> > Sent: Thursday, June 8, 2023 2:14 PM
> > To: thomas@monjalon.net; bruce.richardson@intel.com;
> > mb@smartsharesystems.com; chenbo.xia@intel.com
> > Cc: dev@dpdk.org; jiayu.hu@intel.com; xuan.ding@intel.com;
> > wenwux.ma@intel.com; yuanx.wang@intel.com; xingguang.he@intel.com;
> > Cheng Jiang <cheng1.jiang@intel.com>
> > Subject: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf application
> >
> > External Email
> >
> > ----------------------------------------------------------------------
> > There are many high-performance DMA devices supported in DPDK now,
> and
> > these DMA devices can also be integrated into other modules of DPDK as
> > accelerators, such as Vhost. Before integrating DMA into applications,
> > developers need to know the performance of these DMA devices in
> > various scenarios and the performance of CPUs in the same scenario,
> > such as different buffer lengths. Only in this way can we know the
> > target performance of the application accelerated by using them. This
> > patch introduces a high-performance testing tool, which supports
> > comparing the performance of CPU and DMA in different scenarios
> > automatically with a pre- set config file. Memory Copy performance test
> are supported for now.
> >
> > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> > ---
> > v5:
> >   fixed some LONG_LINE warnings;
> > v4:
> >   fixed inaccuracy of the memory footprint display;
> > v3:
> >   fixed some typos;
> > v2:
> >   added lcore/dmadev designation;
> >   added error case process;
> >   removed worker_threads parameter from config.ini;
> >   improved the logs;
> >   improved config file;
> >
> >  app/meson.build               |   1 +
> >  app/test-dma-perf/benchmark.c | 472 ++++++++++++++++++++++++++++
> > app/test-dma-perf/config.ini  |  59 ++++
> >  app/test-dma-perf/main.c      | 569
> > ++++++++++++++++++++++++++++++++++
> >  app/test-dma-perf/main.h      |  69 +++++
> >  app/test-dma-perf/meson.build |  17 +
> >  6 files changed, 1187 insertions(+)
> >  create mode 100644 app/test-dma-perf/benchmark.c  create mode
> 100644
> > app/test-dma-perf/config.ini  create mode 100644 app/test-dma-
> > perf/main.c  create mode 100644 app/test-dma-perf/main.h  create mode
> > 100644 app/test-dma-perf/meson.build
> >
> 
> <snip>
> 
> > +
> > +static inline int
> > +do_dma_mem_copy(void *p)
> > +{
> > +	uint16_t *para_idx = (uint16_t *)p;
> > +	volatile struct lcore_params *para = worker_params[*para_idx];
> > +	volatile struct worker_info *worker_info = &(para->worker_info);
> > +	uint16_t dev_id = para->dev_id;
> > +	uint32_t nr_buf = para->nr_buf;
> > +	uint16_t kick_batch = para->kick_batch;
> > +	uint32_t buf_size = para->buf_size;
> > +	struct rte_mbuf **srcs = para->srcs;
> > +	struct rte_mbuf **dsts = para->dsts;
> > +	int64_t async_cnt = 0;
> > +	int nr_cpl = 0;
> > +	uint32_t i;
> > +	uint32_t poll_cnt = 0;
> > +
> > +	worker_info->stop_flag = false;
> > +	worker_info->ready_flag = true;
> > +
> > +	while (!worker_info->start_flag)
> > +		;
> > +
> > +	while (1) {
> > +		for (i = 0; i < nr_buf; i++) {
> > +			if (unlikely(rte_dma_copy(dev_id,
> > +						0,
> > +						rte_pktmbuf_iova(srcs[i]),
> > +						rte_pktmbuf_iova(dsts[i]),
> > +						buf_size,
> > +						0) < 0)) {
> > +				rte_dma_submit(dev_id, 0);
> > +				while (rte_dma_burst_capacity(dev_id, 0) ==
> > 0) {
> > +					nr_cpl = rte_dma_completed(dev_id,
> > 0, MAX_DMA_CPL_NB,
> > +								NULL, NULL);
> > +					async_cnt -= nr_cpl;
> > +					worker_info->total_cpl += nr_cpl;
> > +				}
> > +				if (rte_dma_copy(dev_id,
> > +						0,
> > +						rte_pktmbuf_iova(srcs[i]),
> > +						rte_pktmbuf_iova(dsts[i]),
> > +						buf_size,
> > +						0) < 0) {
> > +					printf("enqueue fail again at %u\n",
> > i);
> > +					printf("space:%d\n",
> > rte_dma_burst_capacity(dev_id, 0));
> > +					rte_exit(EXIT_FAILURE, "DMA
> > enqueue failed\n");
> 
> [Amit]: On all success or failure exits, please call rte_dma_stop and
> rte_dma_close to exit cleanly.

[Cheng] Got it. Thanks, I'll fix it in the next version.

> 
> > +				}
> > +			}
> > +			async_cnt++;
> > +
> > +			if ((async_cnt % kick_batch) == 0) {
> > +				rte_dma_submit(dev_id, 0);
> > +				/* add a poll to avoid ring full */
> > +				nr_cpl = rte_dma_completed(dev_id, 0,
> > MAX_DMA_CPL_NB, NULL, NULL);
> > +				async_cnt -= nr_cpl;
> > +				worker_info->total_cpl += nr_cpl;
> > +			}
> > +		}
> > +
> > +		if (worker_info->stop_flag)
> > +			break;
> > +	}
> > +
> > +	rte_dma_submit(dev_id, 0);
> > +	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
> > +		nr_cpl = rte_dma_completed(dev_id, 0,
> > MAX_DMA_CPL_NB, NULL, NULL);
> > +		async_cnt -= nr_cpl;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> 
> <snip>
> 
> > +
> > +void
> > +mem_copy_benchmark(struct test_configure *cfg, bool is_dma) {
> > +	uint16_t i;
> > +	uint32_t offset;
> > +	unsigned int lcore_id = 0;
> > +	struct rte_mbuf **srcs = NULL, **dsts = NULL;
> > +	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> > +	unsigned int buf_size = cfg->buf_size.cur;
> > +	uint16_t kick_batch = cfg->kick_batch.cur;
> > +	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) /
> > (cfg->buf_size.cur * 2);
> > +	uint16_t nb_workers = ldm->cnt;
> > +	uint16_t test_secs = cfg->test_secs;
> > +	float memory;
> > +	uint32_t avg_cycles = 0;
> > +	float mops;
> > +	float bandwidth;
> > +
> > +	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> > +		goto out;
> > +
> > +	if (is_dma)
> > +		if (config_dmadevs(cfg) < 0)
> > +			goto out;
> > +
> > +	if (cfg->cache_flush) {
> > +		cache_flush_buf(srcs, buf_size, nr_buf);
> > +		cache_flush_buf(dsts, buf_size, nr_buf);
> > +		rte_mb();
> > +	}
> > +
> > +	printf("Start testing....\n");
> > +
> > +	for (i = 0; i < nb_workers; i++) {
> > +		lcore_id = ldm->lcores[i];
> > +		offset = nr_buf / nb_workers * i;
> > +
> > +		worker_params[i] = rte_malloc(NULL, sizeof(struct
> > lcore_params), 0);
> > +		if (!worker_params[i]) {
> > +			printf("lcore parameters malloc failure for lcore
> > %d\n", lcore_id);
> > +			break;
> > +		}
> > +		if (is_dma) {
> > +			worker_params[i]->dma_name = ldm-
> > >dma_names[i];
> > +			worker_params[i]->dev_id = ldm->dma_ids[i];
> > +			worker_params[i]->kick_batch = kick_batch;
> > +		}
> > +		worker_params[i]->worker_id = i;
> > +		worker_params[i]->nr_buf = (uint32_t)(nr_buf /
> > nb_workers);
> > +		worker_params[i]->buf_size = buf_size;
> > +		worker_params[i]->test_secs = test_secs;
> > +		worker_params[i]->srcs = srcs + offset;
> > +		worker_params[i]->dsts = dsts + offset;
> > +		worker_params[i]->scenario_id = cfg->scenario_id;
> > +		worker_params[i]->lcore_id = lcore_id;
> > +
> > +		if (is_dma)
> > +			rte_eal_remote_launch(do_dma_mem_copy, (void
> > *)(&i), lcore_id);
> > +		else
> > +			rte_eal_remote_launch(do_cpu_mem_copy, (void
> > *)(&i), lcore_id);
> > +	}
> > +
> > +	while (1) {
> > +		bool ready = true;
> > +		for (i = 0; i < nb_workers; i++) {
> > +			if (worker_params[i]->worker_info.ready_flag ==
> > false) {
> > +				ready = 0;
> > +				break;
> > +			}
> > +		}
> > +		if (ready)
> > +			break;
> > +	}
> > +
> > +	for (i = 0; i < nb_workers; i++)
> > +		worker_params[i]->worker_info.start_flag = true;
> > +
> > +	usleep(TEST_WAIT_U_SECOND);
> > +	for (i = 0; i < nb_workers; i++)
> > +		worker_params[i]->worker_info.test_cpl =
> > +worker_params[i]->worker_info.total_cpl;
> > +
> > +	usleep(test_secs * 1000 * 1000);
> > +	for (i = 0; i < nb_workers; i++)
> > +		worker_params[i]->worker_info.test_cpl =
> > worker_params[i]->worker_info.total_cpl -
> > +						worker_params[i]-
> > >worker_info.test_cpl;
> > +
> > +	for (i = 0; i < nb_workers; i++)
> > +		worker_params[i]->worker_info.stop_flag = true;
> > +
> > +	rte_eal_mp_wait_lcore();
> > +
> > +	for (i = 0; i < nb_workers; i++) {
> > +		calc_result(buf_size, nr_buf, nb_workers, test_secs,
> > +			worker_params[i]->worker_info.test_cpl,
> > +			&memory, &avg_cycles, &bandwidth, &mops);
> > +		output_result(cfg->scenario_id, worker_params[i]->lcore_id,
> > +					worker_params[i]->dma_name,
> > avg_cycles, buf_size,
> > +					nr_buf / nb_workers, memory,
> > bandwidth, mops, is_dma);
> > +	}
> > +
> > +out:
> > +	/* free env */
> > +	if (srcs)
> > +		rte_pktmbuf_free_bulk(srcs, nr_buf);
> > +	if (dsts)
> > +		rte_pktmbuf_free_bulk(dsts, nr_buf);
> > +
> > +	if (src_pool)
> > +		rte_mempool_free(src_pool);
> > +	if (dst_pool)
> > +		rte_mempool_free(dst_pool);
> > +
> > +	if (is_dma) {
> > +		for (i = 0; i < nb_workers; i++) {
> > +			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
> > +			rte_dma_stop(ldm->dma_ids[i]);
> 
> [Amit]: Below rte_dma_stop please call rte_dma_close for clean exit.
> 

[Cheng] Sure, I'll fix it in the next version.

> <snip>
> 
> > +#endif /* _MAIN_H_ */
> > diff --git a/app/test-dma-perf/meson.build b/app/test-dma-
> > perf/meson.build new file mode 100644 index 0000000000..bd6c264002
> > --- /dev/null
> > +++ b/app/test-dma-perf/meson.build
> > @@ -0,0 +1,17 @@
> > +# SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2019-2023
> > +Intel Corporation
> > +
> > +# meson file, for building this app as part of a main DPDK build.
> > +
> > +if is_windows
> > +    build = false
> > +    reason = 'not supported on Windows'
> > +    subdir_done()
> > +endif
> > +
> > +deps += ['dmadev', 'mbuf', 'cfgfile']
> > +
> > +sources = files(
> > +        'main.c',
> > +        'benchmark.c',
> > +)
> > --
> > 2.40.1
  
Jiang, Cheng1 June 13, 2023, 4:51 a.m. UTC | #5
Hi,

Replies are inline.

> -----Original Message-----
> From: Jiang, Cheng1
> Sent: Monday, June 12, 2023 4:27 PM
> To: Amit Prakash Shukla <amitprakashs@marvell.com>;
> thomas@monjalon.net; Richardson, Bruce <bruce.richardson@intel.com>;
> mb@smartsharesystems.com; Xia, Chenbo <Chenbo.Xia@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <Jiayu.Hu@intel.com>; Ding, Xuan
> <Xuan.Ding@intel.com>; Ma, WenwuX <WenwuX.Ma@intel.com>; Wang,
> YuanX <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>;
> Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Anoob Joseph
> <anoobj@marvell.com>
> Subject: RE: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf application
> 
> Hi,
> 
> Thanks for your comments, replies are inline.
> 
> Thanks,
> Cheng
> 
> > -----Original Message-----
> > From: Amit Prakash Shukla <amitprakashs@marvell.com>
> > Sent: Friday, June 9, 2023 10:03 PM
> > To: Jiang, Cheng1 <cheng1.jiang@intel.com>; thomas@monjalon.net;
> > Richardson, Bruce <bruce.richardson@intel.com>;
> > mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>
> > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang,
> YuanX
> > <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>; Jerin
> > Jacob Kollanukkaran <jerinj@marvell.com>; Anoob Joseph
> > <anoobj@marvell.com>
> > Subject: RE: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf
> > application
> >
> >
> >
> > > -----Original Message-----
> > > From: Cheng Jiang <cheng1.jiang@intel.com>
> > > Sent: Thursday, June 8, 2023 2:14 PM
> > > To: thomas@monjalon.net; bruce.richardson@intel.com;
> > > mb@smartsharesystems.com; chenbo.xia@intel.com
> > > Cc: dev@dpdk.org; jiayu.hu@intel.com; xuan.ding@intel.com;
> > > wenwux.ma@intel.com; yuanx.wang@intel.com;
> xingguang.he@intel.com;
> > > Cheng Jiang <cheng1.jiang@intel.com>
> > > Subject: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf
> > > application
> > >
> > > External Email
> > >
> > > --------------------------------------------------------------------
> > > -- There are many high-performance DMA devices supported in DPDK
> > > now,
> > and
> > > these DMA devices can also be integrated into other modules of DPDK
> > > as accelerators, such as Vhost. Before integrating DMA into
> > > applications, developers need to know the performance of these DMA
> > > devices in various scenarios and the performance of CPUs in the same
> > > scenario, such as different buffer lengths. Only in this way can we
> > > know the target performance of the application accelerated by using
> > > them. This patch introduces a high-performance testing tool, which
> > > supports comparing the performance of CPU and DMA in different
> > > scenarios automatically with a pre- set config file. Memory Copy
> > > performance test
> > are supported for now.
> > >
> > > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > > Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> > > ---
> > > v5:
> > >   fixed some LONG_LINE warnings;
> > > v4:
> > >   fixed inaccuracy of the memory footprint display;
> > > v3:
> > >   fixed some typos;
> > > v2:
> > >   added lcore/dmadev designation;
> > >   added error case process;
> > >   removed worker_threads parameter from config.ini;
> > >   improved the logs;
> > >   improved config file;
> > >
> > >  app/meson.build               |   1 +
> > >  app/test-dma-perf/benchmark.c | 472 ++++++++++++++++++++++++++++
> > > app/test-dma-perf/config.ini  |  59 ++++
> > >  app/test-dma-perf/main.c      | 569
> > > ++++++++++++++++++++++++++++++++++
> > >  app/test-dma-perf/main.h      |  69 +++++
> > >  app/test-dma-perf/meson.build |  17 +
> > >  6 files changed, 1187 insertions(+)  create mode 100644
> > > app/test-dma-perf/benchmark.c  create mode
> > 100644
> > > app/test-dma-perf/config.ini  create mode 100644 app/test-dma-
> > > perf/main.c  create mode 100644 app/test-dma-perf/main.h  create
> > > mode
> > > 100644 app/test-dma-perf/meson.build
> > >
> >
> > <snip>
> >
> > > +
> > > +static inline int
> > > +do_dma_mem_copy(void *p)
> > > +{
> > > +	uint16_t *para_idx = (uint16_t *)p;
> > > +	volatile struct lcore_params *para = worker_params[*para_idx];
> > > +	volatile struct worker_info *worker_info = &(para->worker_info);
> > > +	uint16_t dev_id = para->dev_id;
> > > +	uint32_t nr_buf = para->nr_buf;
> > > +	uint16_t kick_batch = para->kick_batch;
> > > +	uint32_t buf_size = para->buf_size;
> > > +	struct rte_mbuf **srcs = para->srcs;
> > > +	struct rte_mbuf **dsts = para->dsts;
> > > +	int64_t async_cnt = 0;
> > > +	int nr_cpl = 0;
> > > +	uint32_t i;
> > > +	uint32_t poll_cnt = 0;
> > > +
> > > +	worker_info->stop_flag = false;
> > > +	worker_info->ready_flag = true;
> > > +
> > > +	while (!worker_info->start_flag)
> > > +		;
> > > +
> > > +	while (1) {
> > > +		for (i = 0; i < nr_buf; i++) {
> > > +			if (unlikely(rte_dma_copy(dev_id,
> > > +						0,
> > > +						rte_pktmbuf_iova(srcs[i]),
> > > +						rte_pktmbuf_iova(dsts[i]),
> > > +						buf_size,
> > > +						0) < 0)) {
> > > +				rte_dma_submit(dev_id, 0);
> > > +				while (rte_dma_burst_capacity(dev_id, 0) ==
> > > 0) {
> > > +					nr_cpl = rte_dma_completed(dev_id,
> > > 0, MAX_DMA_CPL_NB,
> > > +								NULL, NULL);
> > > +					async_cnt -= nr_cpl;
> > > +					worker_info->total_cpl += nr_cpl;
> > > +				}
> > > +				if (rte_dma_copy(dev_id,
> > > +						0,
> > > +						rte_pktmbuf_iova(srcs[i]),
> > > +						rte_pktmbuf_iova(dsts[i]),
> > > +						buf_size,
> > > +						0) < 0) {
> > > +					printf("enqueue fail again at %u\n",
> > > i);
> > > +					printf("space:%d\n",
> > > rte_dma_burst_capacity(dev_id, 0));
> > > +					rte_exit(EXIT_FAILURE, "DMA
> > > enqueue failed\n");
> >
> > [Amit]: On all success or failure exits, please call rte_dma_stop and
> > rte_dma_close to exit cleanly.
> 
> [Cheng] Got it. Thanks, I'll fix it in the next version.

[Cheng] Hi, I take a look into it, and find out rte_exit() will call rte_dma_close(), so I think there is no need to do it explicitly here. What do you think?

> 
> >
> > > +				}
> > > +			}
> > > +			async_cnt++;
> > > +
> > > +			if ((async_cnt % kick_batch) == 0) {
> > > +				rte_dma_submit(dev_id, 0);
> > > +				/* add a poll to avoid ring full */
> > > +				nr_cpl = rte_dma_completed(dev_id, 0,
> > > MAX_DMA_CPL_NB, NULL, NULL);
> > > +				async_cnt -= nr_cpl;
> > > +				worker_info->total_cpl += nr_cpl;
> > > +			}
> > > +		}
> > > +
> > > +		if (worker_info->stop_flag)
> > > +			break;
> > > +	}
> > > +
> > > +	rte_dma_submit(dev_id, 0);
> > > +	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
> > > +		nr_cpl = rte_dma_completed(dev_id, 0,
> > > MAX_DMA_CPL_NB, NULL, NULL);
> > > +		async_cnt -= nr_cpl;
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> >
> > <snip>
> >
> > > +
> > > +void
> > > +mem_copy_benchmark(struct test_configure *cfg, bool is_dma) {
> > > +	uint16_t i;
> > > +	uint32_t offset;
> > > +	unsigned int lcore_id = 0;
> > > +	struct rte_mbuf **srcs = NULL, **dsts = NULL;
> > > +	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> > > +	unsigned int buf_size = cfg->buf_size.cur;
> > > +	uint16_t kick_batch = cfg->kick_batch.cur;
> > > +	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024)
> > > +/
> > > (cfg->buf_size.cur * 2);
> > > +	uint16_t nb_workers = ldm->cnt;
> > > +	uint16_t test_secs = cfg->test_secs;
> > > +	float memory;
> > > +	uint32_t avg_cycles = 0;
> > > +	float mops;
> > > +	float bandwidth;
> > > +
> > > +	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> > > +		goto out;
> > > +
> > > +	if (is_dma)
> > > +		if (config_dmadevs(cfg) < 0)
> > > +			goto out;
> > > +
> > > +	if (cfg->cache_flush) {
> > > +		cache_flush_buf(srcs, buf_size, nr_buf);
> > > +		cache_flush_buf(dsts, buf_size, nr_buf);
> > > +		rte_mb();
> > > +	}
> > > +
> > > +	printf("Start testing....\n");
> > > +
> > > +	for (i = 0; i < nb_workers; i++) {
> > > +		lcore_id = ldm->lcores[i];
> > > +		offset = nr_buf / nb_workers * i;
> > > +
> > > +		worker_params[i] = rte_malloc(NULL, sizeof(struct
> > > lcore_params), 0);
> > > +		if (!worker_params[i]) {
> > > +			printf("lcore parameters malloc failure for lcore
> > > %d\n", lcore_id);
> > > +			break;
> > > +		}
> > > +		if (is_dma) {
> > > +			worker_params[i]->dma_name = ldm-
> > > >dma_names[i];
> > > +			worker_params[i]->dev_id = ldm->dma_ids[i];
> > > +			worker_params[i]->kick_batch = kick_batch;
> > > +		}
> > > +		worker_params[i]->worker_id = i;
> > > +		worker_params[i]->nr_buf = (uint32_t)(nr_buf /
> > > nb_workers);
> > > +		worker_params[i]->buf_size = buf_size;
> > > +		worker_params[i]->test_secs = test_secs;
> > > +		worker_params[i]->srcs = srcs + offset;
> > > +		worker_params[i]->dsts = dsts + offset;
> > > +		worker_params[i]->scenario_id = cfg->scenario_id;
> > > +		worker_params[i]->lcore_id = lcore_id;
> > > +
> > > +		if (is_dma)
> > > +			rte_eal_remote_launch(do_dma_mem_copy, (void
> > > *)(&i), lcore_id);
> > > +		else
> > > +			rte_eal_remote_launch(do_cpu_mem_copy, (void
> > > *)(&i), lcore_id);
> > > +	}
> > > +
> > > +	while (1) {
> > > +		bool ready = true;
> > > +		for (i = 0; i < nb_workers; i++) {
> > > +			if (worker_params[i]->worker_info.ready_flag ==
> > > false) {
> > > +				ready = 0;
> > > +				break;
> > > +			}
> > > +		}
> > > +		if (ready)
> > > +			break;
> > > +	}
> > > +
> > > +	for (i = 0; i < nb_workers; i++)
> > > +		worker_params[i]->worker_info.start_flag = true;
> > > +
> > > +	usleep(TEST_WAIT_U_SECOND);
> > > +	for (i = 0; i < nb_workers; i++)
> > > +		worker_params[i]->worker_info.test_cpl =
> > > +worker_params[i]->worker_info.total_cpl;
> > > +
> > > +	usleep(test_secs * 1000 * 1000);
> > > +	for (i = 0; i < nb_workers; i++)
> > > +		worker_params[i]->worker_info.test_cpl =
> > > worker_params[i]->worker_info.total_cpl -
> > > +						worker_params[i]-
> > > >worker_info.test_cpl;
> > > +
> > > +	for (i = 0; i < nb_workers; i++)
> > > +		worker_params[i]->worker_info.stop_flag = true;
> > > +
> > > +	rte_eal_mp_wait_lcore();
> > > +
> > > +	for (i = 0; i < nb_workers; i++) {
> > > +		calc_result(buf_size, nr_buf, nb_workers, test_secs,
> > > +			worker_params[i]->worker_info.test_cpl,
> > > +			&memory, &avg_cycles, &bandwidth, &mops);
> > > +		output_result(cfg->scenario_id, worker_params[i]->lcore_id,
> > > +					worker_params[i]->dma_name,
> > > avg_cycles, buf_size,
> > > +					nr_buf / nb_workers, memory,
> > > bandwidth, mops, is_dma);
> > > +	}
> > > +
> > > +out:
> > > +	/* free env */
> > > +	if (srcs)
> > > +		rte_pktmbuf_free_bulk(srcs, nr_buf);
> > > +	if (dsts)
> > > +		rte_pktmbuf_free_bulk(dsts, nr_buf);
> > > +
> > > +	if (src_pool)
> > > +		rte_mempool_free(src_pool);
> > > +	if (dst_pool)
> > > +		rte_mempool_free(dst_pool);
> > > +
> > > +	if (is_dma) {
> > > +		for (i = 0; i < nb_workers; i++) {
> > > +			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
> > > +			rte_dma_stop(ldm->dma_ids[i]);
> >
> > [Amit]: Below rte_dma_stop please call rte_dma_close for clean exit.
> >
> 
> [Cheng] Sure, I'll fix it in the next version.

[Cheng] We are not planning to exit here, we are just going to do the next test, so we shouldn't call the rte_dma_close(). Just clarify. And when we finished all the test, we will call the rte_eal_cleanup() which will call the rte_dma_close(). Thanks!

> 
> > <snip>
> >
> > > +#endif /* _MAIN_H_ */
> > > diff --git a/app/test-dma-perf/meson.build b/app/test-dma-
> > > perf/meson.build new file mode 100644 index 0000000000..bd6c264002
> > > --- /dev/null
> > > +++ b/app/test-dma-perf/meson.build
> > > @@ -0,0 +1,17 @@
> > > +# SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2019-2023
> > > +Intel Corporation
> > > +
> > > +# meson file, for building this app as part of a main DPDK build.
> > > +
> > > +if is_windows
> > > +    build = false
> > > +    reason = 'not supported on Windows'
> > > +    subdir_done()
> > > +endif
> > > +
> > > +deps += ['dmadev', 'mbuf', 'cfgfile']
> > > +
> > > +sources = files(
> > > +        'main.c',
> > > +        'benchmark.c',
> > > +)
> > > --
> > > 2.40.1
  
Amit Prakash Shukla June 13, 2023, 7:34 a.m. UTC | #6
Hi Cheng,

My replies are inline.

Thanks,
Amit  Shukla

> -----Original Message-----
> From: Jiang, Cheng1 <cheng1.jiang@intel.com>
> Sent: Tuesday, June 13, 2023 10:21 AM
> To: Amit Prakash Shukla <amitprakashs@marvell.com>;
> thomas@monjalon.net; Richardson, Bruce <bruce.richardson@intel.com>;
> mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>
> Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang,
> YuanX <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>;
> Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Anoob Joseph
> <anoobj@marvell.com>
> Subject: RE: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf application
> 
> Hi,
> 
> Replies are inline.
> 
> > -----Original Message-----
> > From: Jiang, Cheng1
> > Sent: Monday, June 12, 2023 4:27 PM
> > To: Amit Prakash Shukla <amitprakashs@marvell.com>;
> > thomas@monjalon.net; Richardson, Bruce <bruce.richardson@intel.com>;
> > mb@smartsharesystems.com; Xia, Chenbo <Chenbo.Xia@intel.com>
> > Cc: dev@dpdk.org; Hu, Jiayu <Jiayu.Hu@intel.com>; Ding, Xuan
> > <Xuan.Ding@intel.com>; Ma, WenwuX <WenwuX.Ma@intel.com>; Wang,
> YuanX
> > <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>;
> Jerin
> > Jacob Kollanukkaran <jerinj@marvell.com>; Anoob Joseph
> > <anoobj@marvell.com>
> > Subject: RE: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf
> > application
> >
> > Hi,
> >
> > Thanks for your comments, replies are inline.
> >
> > Thanks,
> > Cheng
> >
> > > -----Original Message-----
> > > From: Amit Prakash Shukla <amitprakashs@marvell.com>
> > > Sent: Friday, June 9, 2023 10:03 PM
> > > To: Jiang, Cheng1 <cheng1.jiang@intel.com>; thomas@monjalon.net;
> > > Richardson, Bruce <bruce.richardson@intel.com>;
> > > mb@smartsharesystems.com; Xia, Chenbo <chenbo.xia@intel.com>
> > > Cc: dev@dpdk.org; Hu, Jiayu <jiayu.hu@intel.com>; Ding, Xuan
> > > <xuan.ding@intel.com>; Ma, WenwuX <wenwux.ma@intel.com>; Wang,
> > YuanX
> > > <yuanx.wang@intel.com>; He, Xingguang <xingguang.he@intel.com>;
> > > Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Anoob Joseph
> > > <anoobj@marvell.com>
> > > Subject: RE: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf
> > > application
> > >
> > >
> > >
> > > > -----Original Message-----
> > > > From: Cheng Jiang <cheng1.jiang@intel.com>
> > > > Sent: Thursday, June 8, 2023 2:14 PM
> > > > To: thomas@monjalon.net; bruce.richardson@intel.com;
> > > > mb@smartsharesystems.com; chenbo.xia@intel.com
> > > > Cc: dev@dpdk.org; jiayu.hu@intel.com; xuan.ding@intel.com;
> > > > wenwux.ma@intel.com; yuanx.wang@intel.com;
> > xingguang.he@intel.com;
> > > > Cheng Jiang <cheng1.jiang@intel.com>
> > > > Subject: [EXT] [PATCH v5] app/dma-perf: introduce dma-perf
> > > > application
> > > >
> > > > External Email
> > > >
> > > > ------------------------------------------------------------------
> > > > --
> > > > -- There are many high-performance DMA devices supported in DPDK
> > > > now,
> > > and
> > > > these DMA devices can also be integrated into other modules of
> > > > DPDK as accelerators, such as Vhost. Before integrating DMA into
> > > > applications, developers need to know the performance of these DMA
> > > > devices in various scenarios and the performance of CPUs in the
> > > > same scenario, such as different buffer lengths. Only in this way
> > > > can we know the target performance of the application accelerated
> > > > by using them. This patch introduces a high-performance testing
> > > > tool, which supports comparing the performance of CPU and DMA in
> > > > different scenarios automatically with a pre- set config file.
> > > > Memory Copy performance test
> > > are supported for now.
> > > >
> > > > Signed-off-by: Cheng Jiang <cheng1.jiang@intel.com>
> > > > Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> > > > Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> > > > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > > > Acked-by: Chenbo Xia <chenbo.xia@intel.com>
> > > > ---
> > > > v5:
> > > >   fixed some LONG_LINE warnings;
> > > > v4:
> > > >   fixed inaccuracy of the memory footprint display;
> > > > v3:
> > > >   fixed some typos;
> > > > v2:
> > > >   added lcore/dmadev designation;
> > > >   added error case process;
> > > >   removed worker_threads parameter from config.ini;
> > > >   improved the logs;
> > > >   improved config file;
> > > >
> > > >  app/meson.build               |   1 +
> > > >  app/test-dma-perf/benchmark.c | 472
> ++++++++++++++++++++++++++++
> > > > app/test-dma-perf/config.ini  |  59 ++++
> > > >  app/test-dma-perf/main.c      | 569
> > > > ++++++++++++++++++++++++++++++++++
> > > >  app/test-dma-perf/main.h      |  69 +++++
> > > >  app/test-dma-perf/meson.build |  17 +
> > > >  6 files changed, 1187 insertions(+)  create mode 100644
> > > > app/test-dma-perf/benchmark.c  create mode
> > > 100644
> > > > app/test-dma-perf/config.ini  create mode 100644 app/test-dma-
> > > > perf/main.c  create mode 100644 app/test-dma-perf/main.h  create
> > > > mode
> > > > 100644 app/test-dma-perf/meson.build
> > > >
> > >
> > > <snip>
> > >
> > > > +
> > > > +static inline int
> > > > +do_dma_mem_copy(void *p)
> > > > +{
> > > > +	uint16_t *para_idx = (uint16_t *)p;
> > > > +	volatile struct lcore_params *para = worker_params[*para_idx];
> > > > +	volatile struct worker_info *worker_info = &(para->worker_info);
> > > > +	uint16_t dev_id = para->dev_id;
> > > > +	uint32_t nr_buf = para->nr_buf;
> > > > +	uint16_t kick_batch = para->kick_batch;
> > > > +	uint32_t buf_size = para->buf_size;
> > > > +	struct rte_mbuf **srcs = para->srcs;
> > > > +	struct rte_mbuf **dsts = para->dsts;
> > > > +	int64_t async_cnt = 0;
> > > > +	int nr_cpl = 0;
> > > > +	uint32_t i;
> > > > +	uint32_t poll_cnt = 0;
> > > > +
> > > > +	worker_info->stop_flag = false;
> > > > +	worker_info->ready_flag = true;
> > > > +
> > > > +	while (!worker_info->start_flag)
> > > > +		;
> > > > +
> > > > +	while (1) {
> > > > +		for (i = 0; i < nr_buf; i++) {
> > > > +			if (unlikely(rte_dma_copy(dev_id,
> > > > +						0,
> > > > +						rte_pktmbuf_iova(srcs[i]),
> > > > +						rte_pktmbuf_iova(dsts[i]),
> > > > +						buf_size,
> > > > +						0) < 0)) {
> > > > +				rte_dma_submit(dev_id, 0);
> > > > +				while (rte_dma_burst_capacity(dev_id, 0) ==
> > > > 0) {
> > > > +					nr_cpl = rte_dma_completed(dev_id,
> > > > 0, MAX_DMA_CPL_NB,
> > > > +								NULL, NULL);
> > > > +					async_cnt -= nr_cpl;
> > > > +					worker_info->total_cpl += nr_cpl;
> > > > +				}
> > > > +				if (rte_dma_copy(dev_id,
> > > > +						0,
> > > > +						rte_pktmbuf_iova(srcs[i]),
> > > > +						rte_pktmbuf_iova(dsts[i]),
> > > > +						buf_size,
> > > > +						0) < 0) {
> > > > +					printf("enqueue fail again at %u\n",
> > > > i);
> > > > +					printf("space:%d\n",
> > > > rte_dma_burst_capacity(dev_id, 0));
> > > > +					rte_exit(EXIT_FAILURE, "DMA
> > > > enqueue failed\n");
> > >
> > > [Amit]: On all success or failure exits, please call rte_dma_stop
> > > and rte_dma_close to exit cleanly.
> >
> > [Cheng] Got it. Thanks, I'll fix it in the next version.
> 
> [Cheng] Hi, I take a look into it, and find out rte_exit() will call
> rte_dma_close(), so I think there is no need to do it explicitly here. What do
> you think?

[Amit]: I agree, not required to explicitly call rte_dma_close here.

> 
> >
> > >
> > > > +				}
> > > > +			}
> > > > +			async_cnt++;
> > > > +
> > > > +			if ((async_cnt % kick_batch) == 0) {
> > > > +				rte_dma_submit(dev_id, 0);
> > > > +				/* add a poll to avoid ring full */
> > > > +				nr_cpl = rte_dma_completed(dev_id, 0,
> > > > MAX_DMA_CPL_NB, NULL, NULL);
> > > > +				async_cnt -= nr_cpl;
> > > > +				worker_info->total_cpl += nr_cpl;
> > > > +			}
> > > > +		}
> > > > +
> > > > +		if (worker_info->stop_flag)
> > > > +			break;
> > > > +	}
> > > > +
> > > > +	rte_dma_submit(dev_id, 0);
> > > > +	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
> > > > +		nr_cpl = rte_dma_completed(dev_id, 0,
> > > > MAX_DMA_CPL_NB, NULL, NULL);
> > > > +		async_cnt -= nr_cpl;
> > > > +	}
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > >
> > > <snip>
> > >
> > > > +
> > > > +void
> > > > +mem_copy_benchmark(struct test_configure *cfg, bool is_dma) {
> > > > +	uint16_t i;
> > > > +	uint32_t offset;
> > > > +	unsigned int lcore_id = 0;
> > > > +	struct rte_mbuf **srcs = NULL, **dsts = NULL;
> > > > +	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> > > > +	unsigned int buf_size = cfg->buf_size.cur;
> > > > +	uint16_t kick_batch = cfg->kick_batch.cur;
> > > > +	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 *
> > > > +1024) /
> > > > (cfg->buf_size.cur * 2);
> > > > +	uint16_t nb_workers = ldm->cnt;
> > > > +	uint16_t test_secs = cfg->test_secs;
> > > > +	float memory;
> > > > +	uint32_t avg_cycles = 0;
> > > > +	float mops;
> > > > +	float bandwidth;
> > > > +
> > > > +	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> > > > +		goto out;
> > > > +
> > > > +	if (is_dma)
> > > > +		if (config_dmadevs(cfg) < 0)
> > > > +			goto out;
> > > > +
> > > > +	if (cfg->cache_flush) {
> > > > +		cache_flush_buf(srcs, buf_size, nr_buf);
> > > > +		cache_flush_buf(dsts, buf_size, nr_buf);
> > > > +		rte_mb();
> > > > +	}
> > > > +
> > > > +	printf("Start testing....\n");
> > > > +
> > > > +	for (i = 0; i < nb_workers; i++) {
> > > > +		lcore_id = ldm->lcores[i];
> > > > +		offset = nr_buf / nb_workers * i;
> > > > +
> > > > +		worker_params[i] = rte_malloc(NULL, sizeof(struct
> > > > lcore_params), 0);
> > > > +		if (!worker_params[i]) {
> > > > +			printf("lcore parameters malloc failure for lcore
> > > > %d\n", lcore_id);
> > > > +			break;
> > > > +		}
> > > > +		if (is_dma) {
> > > > +			worker_params[i]->dma_name = ldm-
> > > > >dma_names[i];
> > > > +			worker_params[i]->dev_id = ldm->dma_ids[i];
> > > > +			worker_params[i]->kick_batch = kick_batch;
> > > > +		}
> > > > +		worker_params[i]->worker_id = i;
> > > > +		worker_params[i]->nr_buf = (uint32_t)(nr_buf /
> > > > nb_workers);
> > > > +		worker_params[i]->buf_size = buf_size;
> > > > +		worker_params[i]->test_secs = test_secs;
> > > > +		worker_params[i]->srcs = srcs + offset;
> > > > +		worker_params[i]->dsts = dsts + offset;
> > > > +		worker_params[i]->scenario_id = cfg->scenario_id;
> > > > +		worker_params[i]->lcore_id = lcore_id;
> > > > +
> > > > +		if (is_dma)
> > > > +			rte_eal_remote_launch(do_dma_mem_copy, (void
> > > > *)(&i), lcore_id);
> > > > +		else
> > > > +			rte_eal_remote_launch(do_cpu_mem_copy, (void
> > > > *)(&i), lcore_id);
> > > > +	}
> > > > +
> > > > +	while (1) {
> > > > +		bool ready = true;
> > > > +		for (i = 0; i < nb_workers; i++) {
> > > > +			if (worker_params[i]->worker_info.ready_flag ==
> > > > false) {
> > > > +				ready = 0;
> > > > +				break;
> > > > +			}
> > > > +		}
> > > > +		if (ready)
> > > > +			break;
> > > > +	}
> > > > +
> > > > +	for (i = 0; i < nb_workers; i++)
> > > > +		worker_params[i]->worker_info.start_flag = true;
> > > > +
> > > > +	usleep(TEST_WAIT_U_SECOND);
> > > > +	for (i = 0; i < nb_workers; i++)
> > > > +		worker_params[i]->worker_info.test_cpl =
> > > > +worker_params[i]->worker_info.total_cpl;
> > > > +
> > > > +	usleep(test_secs * 1000 * 1000);
> > > > +	for (i = 0; i < nb_workers; i++)
> > > > +		worker_params[i]->worker_info.test_cpl =
> > > > worker_params[i]->worker_info.total_cpl -
> > > > +						worker_params[i]-
> > > > >worker_info.test_cpl;
> > > > +
> > > > +	for (i = 0; i < nb_workers; i++)
> > > > +		worker_params[i]->worker_info.stop_flag = true;
> > > > +
> > > > +	rte_eal_mp_wait_lcore();
> > > > +
> > > > +	for (i = 0; i < nb_workers; i++) {
> > > > +		calc_result(buf_size, nr_buf, nb_workers, test_secs,
> > > > +			worker_params[i]->worker_info.test_cpl,
> > > > +			&memory, &avg_cycles, &bandwidth, &mops);
> > > > +		output_result(cfg->scenario_id, worker_params[i]->lcore_id,
> > > > +					worker_params[i]->dma_name,
> > > > avg_cycles, buf_size,
> > > > +					nr_buf / nb_workers, memory,
> > > > bandwidth, mops, is_dma);
> > > > +	}
> > > > +
> > > > +out:
> > > > +	/* free env */
> > > > +	if (srcs)
> > > > +		rte_pktmbuf_free_bulk(srcs, nr_buf);
> > > > +	if (dsts)
> > > > +		rte_pktmbuf_free_bulk(dsts, nr_buf);
> > > > +
> > > > +	if (src_pool)
> > > > +		rte_mempool_free(src_pool);
> > > > +	if (dst_pool)
> > > > +		rte_mempool_free(dst_pool);
> > > > +
> > > > +	if (is_dma) {
> > > > +		for (i = 0; i < nb_workers; i++) {
> > > > +			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
> > > > +			rte_dma_stop(ldm->dma_ids[i]);
> > >
> > > [Amit]: Below rte_dma_stop please call rte_dma_close for clean exit.
> > >
> >
> > [Cheng] Sure, I'll fix it in the next version.
> 
> [Cheng] We are not planning to exit here, we are just going to do the next
> test, so we shouldn't call the rte_dma_close(). Just clarify. And when we
> finished all the test, we will call the rte_eal_cleanup() which will call the
> rte_dma_close(). Thanks!

[Amit]: Sure, thanks.

> 
> >
> > > <snip>
> > >
> > > > +#endif /* _MAIN_H_ */
> > > > diff --git a/app/test-dma-perf/meson.build b/app/test-dma-
> > > > perf/meson.build new file mode 100644 index 0000000000..bd6c264002
> > > > --- /dev/null
> > > > +++ b/app/test-dma-perf/meson.build
> > > > @@ -0,0 +1,17 @@
> > > > +# SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2019-2023
> > > > +Intel Corporation
> > > > +
> > > > +# meson file, for building this app as part of a main DPDK build.
> > > > +
> > > > +if is_windows
> > > > +    build = false
> > > > +    reason = 'not supported on Windows'
> > > > +    subdir_done()
> > > > +endif
> > > > +
> > > > +deps += ['dmadev', 'mbuf', 'cfgfile']
> > > > +
> > > > +sources = files(
> > > > +        'main.c',
> > > > +        'benchmark.c',
> > > > +)
> > > > --
> > > > 2.40.1
  

Patch

diff --git a/app/meson.build b/app/meson.build
index 74d2420f67..4fc1a83eba 100644
--- a/app/meson.build
+++ b/app/meson.build
@@ -19,6 +19,7 @@  apps = [
         'test-cmdline',
         'test-compress-perf',
         'test-crypto-perf',
+        'test-dma-perf',
         'test-eventdev',
         'test-fib',
         'test-flow-perf',
diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
new file mode 100644
index 0000000000..39d7b26955
--- /dev/null
+++ b/app/test-dma-perf/benchmark.c
@@ -0,0 +1,472 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <rte_time.h>
+#include <rte_mbuf.h>
+#include <rte_dmadev.h>
+#include <rte_malloc.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+
+#define MAX_DMA_CPL_NB 255
+
+#define TEST_WAIT_U_SECOND 10000
+
+#define CSV_LINE_DMA_FMT "Scenario %u,%u,%s,%u,%u,%.2lf,%" PRIu64 ",%.3lf,%.3lf\n"
+#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,%u,%u,%.2lf,%" PRIu64 ",%.3lf,%.3lf\n"
+
+struct worker_info {
+	bool ready_flag;
+	bool start_flag;
+	bool stop_flag;
+	uint32_t total_cpl;
+	uint32_t test_cpl;
+};
+
+struct lcore_params {
+	uint8_t scenario_id;
+	unsigned int lcore_id;
+	char *dma_name;
+	uint16_t worker_id;
+	uint16_t dev_id;
+	uint32_t nr_buf;
+	uint16_t kick_batch;
+	uint32_t buf_size;
+	uint16_t test_secs;
+	struct rte_mbuf **srcs;
+	struct rte_mbuf **dsts;
+	struct worker_info worker_info;
+};
+
+static struct rte_mempool *src_pool;
+static struct rte_mempool *dst_pool;
+
+static volatile struct lcore_params *worker_params[MAX_WORKER_NB];
+
+#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
+
+static inline int
+__rte_format_printf(3, 4)
+print_err(const char *func, int lineno, const char *format, ...)
+{
+	va_list ap;
+	int ret;
+
+	ret = fprintf(stderr, "In %s:%d - ", func, lineno);
+	va_start(ap, format);
+	ret += vfprintf(stderr, format, ap);
+	va_end(ap);
+
+	return ret;
+}
+
+static inline void
+calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t test_secs,
+				uint32_t total_cnt, float *memory, uint32_t *ave_cycle,
+				float *bandwidth, float *mops)
+{
+	*memory = (float)(buf_size * (nr_buf / nb_workers) * 2) / (1024 * 1024);
+	*ave_cycle = test_secs * rte_get_timer_hz() / total_cnt;
+	*bandwidth = (buf_size * 8 * (rte_get_timer_hz() / (float)*ave_cycle)) / 1000000000;
+	*mops = (float)rte_get_timer_hz() / *ave_cycle / 1000000;
+}
+
+static void
+output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint64_t ave_cycle,
+			uint32_t buf_size, uint32_t nr_buf, float memory,
+			float bandwidth, float mops, bool is_dma)
+{
+	if (is_dma)
+		printf("lcore %u, DMA %s:\n", lcore_id, dma_name);
+	else
+		printf("lcore %u\n", lcore_id);
+
+	printf("average cycles/op: %" PRIu64 ", buffer size: %u, nr_buf: %u, memory: %.2lfMB, frequency: %" PRIu64 ".\n",
+			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz());
+	printf("Average bandwidth: %.3lfGbps, MOps: %.3lf\n", bandwidth, mops);
+
+	if (is_dma)
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
+			scenario_id, lcore_id, dma_name, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, mops);
+	else
+		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_CPU_FMT,
+			scenario_id, lcore_id, buf_size,
+			nr_buf, memory, ave_cycle, bandwidth, mops);
+}
+
+static inline void
+cache_flush_buf(__maybe_unused struct rte_mbuf **array,
+		__maybe_unused uint32_t buf_size,
+		__maybe_unused uint32_t nr_buf)
+{
+#ifdef RTE_ARCH_X86_64
+	char *data;
+	struct rte_mbuf **srcs = array;
+	uint32_t i, offset;
+
+	for (i = 0; i < nr_buf; i++) {
+		data = rte_pktmbuf_mtod(srcs[i], char *);
+		for (offset = 0; offset < buf_size; offset += 64)
+			__builtin_ia32_clflush(data + offset);
+	}
+#endif
+}
+
+/* Configuration of device. */
+static void
+configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
+{
+	uint16_t vchan = 0;
+	struct rte_dma_info info;
+	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
+	struct rte_dma_vchan_conf qconf = {
+		.direction = RTE_DMA_DIR_MEM_TO_MEM,
+		.nb_desc = ring_size
+	};
+
+	if (rte_dma_configure(dev_id, &dev_config) != 0)
+		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
+
+	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
+		rte_exit(EXIT_FAILURE, "Error with queue configuration.\n");
+
+	rte_dma_info_get(dev_id, &info);
+	if (info.nb_vchans != 1)
+		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id. %u\n",
+				dev_id);
+
+	if (rte_dma_start(dev_id) != 0)
+		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
+}
+
+static int
+config_dmadevs(struct test_configure *cfg)
+{
+	uint32_t ring_size = cfg->ring_size.cur;
+	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	uint32_t nb_workers = ldm->cnt;
+	uint32_t i;
+	int dev_id;
+	uint16_t nb_dmadevs = 0;
+	char *dma_name;
+
+	for (i = 0; i < ldm->cnt; i++) {
+		dma_name = ldm->dma_names[i];
+		dev_id = rte_dma_get_dev_id_by_name(dma_name);
+		if (dev_id == -1) {
+			fprintf(stderr, "Error: Fail to find DMA %s.\n", dma_name);
+			goto end;
+		}
+
+		ldm->dma_ids[i] = dev_id;
+		configure_dmadev_queue(dev_id, ring_size);
+		++nb_dmadevs;
+	}
+
+end:
+	if (nb_dmadevs < nb_workers) {
+		printf("Not enough dmadevs (%u) for all workers (%u).\n", nb_dmadevs, nb_workers);
+		return -1;
+	}
+
+	printf("Number of used dmadevs: %u.\n", nb_dmadevs);
+
+	return 0;
+}
+
+#define POLL_MAX 1000
+
+static inline int
+do_dma_mem_copy(void *p)
+{
+	uint16_t *para_idx = (uint16_t *)p;
+	volatile struct lcore_params *para = worker_params[*para_idx];
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	uint16_t dev_id = para->dev_id;
+	uint32_t nr_buf = para->nr_buf;
+	uint16_t kick_batch = para->kick_batch;
+	uint32_t buf_size = para->buf_size;
+	struct rte_mbuf **srcs = para->srcs;
+	struct rte_mbuf **dsts = para->dsts;
+	int64_t async_cnt = 0;
+	int nr_cpl = 0;
+	uint32_t i;
+	uint32_t poll_cnt = 0;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+			if (unlikely(rte_dma_copy(dev_id,
+						0,
+						rte_pktmbuf_iova(srcs[i]),
+						rte_pktmbuf_iova(dsts[i]),
+						buf_size,
+						0) < 0)) {
+				rte_dma_submit(dev_id, 0);
+				while (rte_dma_burst_capacity(dev_id, 0) == 0) {
+					nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB,
+								NULL, NULL);
+					async_cnt -= nr_cpl;
+					worker_info->total_cpl += nr_cpl;
+				}
+				if (rte_dma_copy(dev_id,
+						0,
+						rte_pktmbuf_iova(srcs[i]),
+						rte_pktmbuf_iova(dsts[i]),
+						buf_size,
+						0) < 0) {
+					printf("enqueue fail again at %u\n", i);
+					printf("space:%d\n", rte_dma_burst_capacity(dev_id, 0));
+					rte_exit(EXIT_FAILURE, "DMA enqueue failed\n");
+				}
+			}
+			async_cnt++;
+
+			if ((async_cnt % kick_batch) == 0) {
+				rte_dma_submit(dev_id, 0);
+				/* add a poll to avoid ring full */
+				nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+				async_cnt -= nr_cpl;
+				worker_info->total_cpl += nr_cpl;
+			}
+		}
+
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	rte_dma_submit(dev_id, 0);
+	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
+		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+		async_cnt -= nr_cpl;
+	}
+
+	return 0;
+}
+
+static inline int
+do_cpu_mem_copy(void *p)
+{
+	uint16_t *para_idx = (uint16_t *)p;
+	volatile struct lcore_params *para = worker_params[*para_idx];
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	uint32_t nr_buf = para->nr_buf;
+	uint32_t buf_size = para->buf_size;
+	struct rte_mbuf **srcs = para->srcs;
+	struct rte_mbuf **dsts = para->dsts;
+	uint32_t i;
+
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		for (i = 0; i < nr_buf; i++) {
+			/* copy buffer form src to dst */
+			rte_memcpy((void *)(uintptr_t)rte_mbuf_data_iova(dsts[i]),
+				(void *)(uintptr_t)rte_mbuf_data_iova(srcs[i]),
+				(size_t)buf_size);
+			worker_info->total_cpl++;
+		}
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	return 0;
+}
+
+static int
+setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
+			struct rte_mbuf ***dsts)
+{
+	unsigned int buf_size = cfg->buf_size.cur;
+	unsigned int nr_sockets;
+	uint32_t nr_buf = cfg->nr_buf;
+
+	nr_sockets = rte_socket_count();
+	if (cfg->src_numa_node >= nr_sockets ||
+		cfg->dst_numa_node >= nr_sockets) {
+		printf("Error: Source or destination numa exceeds the acture numa nodes.\n");
+		return -1;
+	}
+
+	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
+			nr_buf, /* n == num elements */
+			64,  /* cache size */
+			0,   /* priv size */
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->src_numa_node);
+	if (src_pool == NULL) {
+		PRINT_ERR("Error with source mempool creation.\n");
+		return -1;
+	}
+
+	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
+			nr_buf, /* n == num elements */
+			64,  /* cache size */
+			0,   /* priv size */
+			buf_size + RTE_PKTMBUF_HEADROOM,
+			cfg->dst_numa_node);
+	if (dst_pool == NULL) {
+		PRINT_ERR("Error with destination mempool creation.\n");
+		return -1;
+	}
+
+	*srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	if (*srcs == NULL) {
+		printf("Error: srcs malloc failed.\n");
+		return -1;
+	}
+
+	*dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
+	if (*dsts == NULL) {
+		printf("Error: dsts malloc failed.\n");
+		return -1;
+	}
+
+	if (rte_mempool_get_bulk(src_pool, (void **)*srcs, nr_buf) != 0) {
+		printf("get src mbufs failed.\n");
+		return -1;
+	}
+	if (rte_mempool_get_bulk(dst_pool, (void **)*dsts, nr_buf) != 0) {
+		printf("get dst mbufs failed.\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+void
+mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
+{
+	uint16_t i;
+	uint32_t offset;
+	unsigned int lcore_id = 0;
+	struct rte_mbuf **srcs = NULL, **dsts = NULL;
+	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	unsigned int buf_size = cfg->buf_size.cur;
+	uint16_t kick_batch = cfg->kick_batch.cur;
+	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
+	uint16_t nb_workers = ldm->cnt;
+	uint16_t test_secs = cfg->test_secs;
+	float memory;
+	uint32_t avg_cycles = 0;
+	float mops;
+	float bandwidth;
+
+	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+		goto out;
+
+	if (is_dma)
+		if (config_dmadevs(cfg) < 0)
+			goto out;
+
+	if (cfg->cache_flush) {
+		cache_flush_buf(srcs, buf_size, nr_buf);
+		cache_flush_buf(dsts, buf_size, nr_buf);
+		rte_mb();
+	}
+
+	printf("Start testing....\n");
+
+	for (i = 0; i < nb_workers; i++) {
+		lcore_id = ldm->lcores[i];
+		offset = nr_buf / nb_workers * i;
+
+		worker_params[i] = rte_malloc(NULL, sizeof(struct lcore_params), 0);
+		if (!worker_params[i]) {
+			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
+			break;
+		}
+		if (is_dma) {
+			worker_params[i]->dma_name = ldm->dma_names[i];
+			worker_params[i]->dev_id = ldm->dma_ids[i];
+			worker_params[i]->kick_batch = kick_batch;
+		}
+		worker_params[i]->worker_id = i;
+		worker_params[i]->nr_buf = (uint32_t)(nr_buf / nb_workers);
+		worker_params[i]->buf_size = buf_size;
+		worker_params[i]->test_secs = test_secs;
+		worker_params[i]->srcs = srcs + offset;
+		worker_params[i]->dsts = dsts + offset;
+		worker_params[i]->scenario_id = cfg->scenario_id;
+		worker_params[i]->lcore_id = lcore_id;
+
+		if (is_dma)
+			rte_eal_remote_launch(do_dma_mem_copy, (void *)(&i), lcore_id);
+		else
+			rte_eal_remote_launch(do_cpu_mem_copy, (void *)(&i), lcore_id);
+	}
+
+	while (1) {
+		bool ready = true;
+		for (i = 0; i < nb_workers; i++) {
+			if (worker_params[i]->worker_info.ready_flag == false) {
+				ready = 0;
+				break;
+			}
+		}
+		if (ready)
+			break;
+	}
+
+	for (i = 0; i < nb_workers; i++)
+		worker_params[i]->worker_info.start_flag = true;
+
+	usleep(TEST_WAIT_U_SECOND);
+	for (i = 0; i < nb_workers; i++)
+		worker_params[i]->worker_info.test_cpl = worker_params[i]->worker_info.total_cpl;
+
+	usleep(test_secs * 1000 * 1000);
+	for (i = 0; i < nb_workers; i++)
+		worker_params[i]->worker_info.test_cpl = worker_params[i]->worker_info.total_cpl -
+						worker_params[i]->worker_info.test_cpl;
+
+	for (i = 0; i < nb_workers; i++)
+		worker_params[i]->worker_info.stop_flag = true;
+
+	rte_eal_mp_wait_lcore();
+
+	for (i = 0; i < nb_workers; i++) {
+		calc_result(buf_size, nr_buf, nb_workers, test_secs,
+			worker_params[i]->worker_info.test_cpl,
+			&memory, &avg_cycles, &bandwidth, &mops);
+		output_result(cfg->scenario_id, worker_params[i]->lcore_id,
+					worker_params[i]->dma_name, avg_cycles, buf_size,
+					nr_buf / nb_workers, memory, bandwidth, mops, is_dma);
+	}
+
+out:
+	/* free env */
+	if (srcs)
+		rte_pktmbuf_free_bulk(srcs, nr_buf);
+	if (dsts)
+		rte_pktmbuf_free_bulk(dsts, nr_buf);
+
+	if (src_pool)
+		rte_mempool_free(src_pool);
+	if (dst_pool)
+		rte_mempool_free(dst_pool);
+
+	if (is_dma) {
+		for (i = 0; i < nb_workers; i++) {
+			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
+			rte_dma_stop(ldm->dma_ids[i]);
+		}
+	}
+}
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
new file mode 100644
index 0000000000..0318da305b
--- /dev/null
+++ b/app/test-dma-perf/config.ini
@@ -0,0 +1,59 @@ 
+
+; This is an example configuration file for dma-perf, which details the meanings of each parameter
+; and instructions on how to use dma-perf.
+
+; Supported test types are DMA_MEM_COPY and CPU_MEM_COPY.
+
+; Parameters:
+; "mem_size" denotes the size of the memory footprint.
+; "buf_size" denotes the memory size of a single operation.
+; "dma_ring_size" denotes the dma ring buffer size. It should be greater than 64 normally.
+; "kick_batch" denotes the dma operation batch size, and should be greater than 1 normally.
+
+; The format for variables is variable=first[,last,increment[,ADD|MUL]]. ADD is the default mode.
+
+; src_numa_node is used to control the numa node where the source memory is allocated.
+; dst_numa_node is used to control the numa node where the destination memory is allocated.
+
+; cache_flush is used to determine whether or not the cache should be flushed, with 1 indicating to
+; flush and 0 indicating to not flush.
+
+; test_seconds controls the test time of the whole case.
+
+; To use DMA for a test, please specify the "lcore_dma" parameter.
+; If you have already set the "-l" and "-a" parameters using EAL,
+; make sure that the value of "lcore_dma" falls within their range of the values.
+
+; To use CPU for a test, please specify the "lcore" parameter.
+; If you have already set the "-l" and "-a" parameters using EAL,
+; make sure that the value of "lcore" falls within their range of values.
+
+; To specify a configuration file, use the "--config" flag followed by the path to the file.
+
+; To specify a result file, use the "--result" flag followed by the path to the file.
+; If you do not specify a result file, one will be generated with the same name as the configuration
+; file, with the addition of "_result.csv" at the end.
+
+[case1]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case2]
+type=CPU_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+src_numa_node=0
+dst_numa_node=1
+cache_flush=0
+test_seconds=2
+lcore = 3, 4
+eal_args=--in-memory --no-pci
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
new file mode 100644
index 0000000000..9c2b55dec4
--- /dev/null
+++ b/app/test-dma-perf/main.c
@@ -0,0 +1,569 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <inttypes.h>
+#include <libgen.h>
+
+#include <rte_eal.h>
+#include <rte_cfgfile.h>
+#include <rte_string_fns.h>
+#include <rte_lcore.h>
+
+#include "main.h"
+
+#define CSV_HDR_FMT "Case %u : %s,lcore,DMA,buffer size,nr_buf,memory(MB),cycle,bandwidth(Gbps),MOps\n"
+
+#define MAX_EAL_PARAM_NB 100
+#define MAX_EAL_PARAM_LEN 1024
+
+#define DMA_MEM_COPY "DMA_MEM_COPY"
+#define CPU_MEM_COPY "CPU_MEM_COPY"
+
+#define CMDLINE_CONFIG_ARG "--config"
+#define CMDLINE_RESULT_ARG "--result"
+
+#define MAX_PARAMS_PER_ENTRY 4
+
+#define MAX_LONG_OPT_SZ 64
+
+enum {
+	TEST_TYPE_NONE = 0,
+	TEST_TYPE_DMA_MEM_COPY,
+	TEST_TYPE_CPU_MEM_COPY
+};
+
+#define MAX_TEST_CASES 16
+static struct test_configure test_cases[MAX_TEST_CASES];
+
+char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+static FILE *fd;
+
+static void
+output_csv(bool need_blankline)
+{
+	uint32_t i;
+
+	if (need_blankline) {
+		fprintf(fd, ",,,,,,,,\n");
+		fprintf(fd, ",,,,,,,,\n");
+	}
+
+	for (i = 0; i < RTE_DIM(output_str); i++) {
+		if (output_str[i][0]) {
+			fprintf(fd, "%s", output_str[i]);
+			output_str[i][0] = '\0';
+		}
+	}
+
+	fflush(fd);
+}
+
+static void
+output_env_info(void)
+{
+	snprintf(output_str[0], MAX_OUTPUT_STR_LEN, "test environment:\n");
+	snprintf(output_str[1], MAX_OUTPUT_STR_LEN, "CPU frequency,%"
+			PRIu64 "\n", rte_get_timer_hz());
+
+	output_csv(true);
+}
+
+static void
+output_header(uint32_t case_id, struct test_configure *case_cfg)
+{
+	snprintf(output_str[0], MAX_OUTPUT_STR_LEN,
+			CSV_HDR_FMT, case_id, case_cfg->test_type_str);
+
+	output_csv(true);
+}
+
+static void
+run_test_case(struct test_configure *case_cfg)
+{
+	switch (case_cfg->test_type) {
+	case TEST_TYPE_DMA_MEM_COPY:
+		mem_copy_benchmark(case_cfg, true);
+		break;
+	case TEST_TYPE_CPU_MEM_COPY:
+		mem_copy_benchmark(case_cfg, false);
+		break;
+	default:
+		printf("Unknown test type. %s\n", case_cfg->test_type_str);
+		break;
+	}
+}
+
+static void
+run_test(uint32_t case_id, struct test_configure *case_cfg)
+{
+	uint32_t i;
+	uint32_t nb_lcores = rte_lcore_count();
+	struct test_configure_entry *mem_size = &case_cfg->mem_size;
+	struct test_configure_entry *buf_size = &case_cfg->buf_size;
+	struct test_configure_entry *ring_size = &case_cfg->ring_size;
+	struct test_configure_entry *kick_batch = &case_cfg->kick_batch;
+	struct test_configure_entry dummy = { 0 };
+	struct test_configure_entry *var_entry = &dummy;
+
+
+
+	for (i = 0; i < RTE_DIM(output_str); i++)
+		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
+
+	if (nb_lcores <= case_cfg->lcore_dma_map.cnt) {
+		printf("Case %u: Not enough lcores.\n", case_id);
+		return;
+	}
+
+	printf("Number of used lcores: %u.\n", nb_lcores);
+
+	if (mem_size->incr != 0)
+		var_entry = mem_size;
+
+	if (buf_size->incr != 0)
+		var_entry = buf_size;
+
+	if (ring_size->incr != 0)
+		var_entry = ring_size;
+
+	if (kick_batch->incr != 0)
+		var_entry = kick_batch;
+
+	case_cfg->scenario_id = 0;
+
+	output_header(case_id, case_cfg);
+
+	for (var_entry->cur = var_entry->first; var_entry->cur <= var_entry->last;) {
+		case_cfg->scenario_id++;
+		printf("\nRunning scenario %d\n", case_cfg->scenario_id);
+
+		run_test_case(case_cfg);
+		output_csv(false);
+
+		if (var_entry->op == OP_MUL)
+			var_entry->cur *= var_entry->incr;
+		else if (var_entry->op == OP_ADD)
+			var_entry->cur += var_entry->incr;
+		else
+			break;
+	}
+
+}
+
+static int
+parse_lcore(struct test_configure *test_case, const char *value)
+{
+	size_t len = strlen(value);
+	char *input = (char *) malloc((len + 1) * sizeof(char));
+	strcpy(input, value);
+	struct lcore_dma_map_t *lcore_dma_map = &(test_case->lcore_dma_map);
+
+	if (test_case == NULL || value == NULL)
+		return -1;
+
+	memset(lcore_dma_map, 0, sizeof(struct lcore_dma_map_t));
+
+	char *token = strtok(input, ", ");
+	while (token != NULL) {
+		if (lcore_dma_map->cnt >= MAX_LCORE_NB) {
+			free(input);
+			return -1;
+		}
+
+		uint16_t lcore_id = atoi(token);
+		lcore_dma_map->lcores[lcore_dma_map->cnt++] = lcore_id;
+
+		token = strtok(NULL, ", ");
+	}
+
+	free(input);
+	return 0;
+}
+
+static int
+parse_lcore_dma(struct test_configure *test_case, const char *value)
+{
+	struct lcore_dma_map_t *lcore_dma_map;
+	char *input = strndup(value, strlen(value) + 1);
+	char *addrs = input;
+	char *ptrs[2];
+	char *start, *end, *substr;
+	uint16_t lcore_id;
+	int ret = 0;
+
+	while (*addrs == '\0')
+		addrs++;
+	if (*addrs == '\0') {
+		fprintf(stderr, "No input DMA addresses\n");
+		ret = -1;
+		goto out;
+	}
+
+	substr = strtok(addrs, ",");
+	if (substr == NULL) {
+		fprintf(stderr, "No input DMA address\n");
+		ret = -1;
+		goto out;
+	}
+
+	memset(&test_case->lcore_dma_map, 0, sizeof(struct lcore_dma_map_t));
+
+	do {
+		rte_strsplit(substr, strlen(substr), ptrs, 2, '@');
+
+		start = strstr(ptrs[0], "lcore");
+		if (start == NULL) {
+			fprintf(stderr, "Illegal lcore\n");
+			ret = -1;
+			break;
+		}
+
+		start += 5;
+		lcore_id = strtol(start, &end, 0);
+		if (end == start) {
+			fprintf(stderr, "No input lcore ID or ID %d is wrong\n", lcore_id);
+			ret = -1;
+			break;
+		}
+
+		lcore_dma_map = &test_case->lcore_dma_map;
+		lcore_dma_map->lcores[lcore_dma_map->cnt] = lcore_id;
+		strcpy(lcore_dma_map->dma_names[lcore_dma_map->cnt], ptrs[1]);
+		lcore_dma_map->cnt++;
+		substr = strtok(NULL, ",");
+	} while (substr != NULL);
+
+out:
+	free(input);
+	return ret;
+}
+
+static int
+parse_entry(const char *value, struct test_configure_entry *entry)
+{
+	char input[255] = {0};
+	char *args[MAX_PARAMS_PER_ENTRY];
+	int args_nr = -1;
+
+	if (value == NULL || entry == NULL)
+		goto out;
+
+	strncpy(input, value, 254);
+	if (*input == '\0')
+		goto out;
+
+	args_nr = rte_strsplit(input, strlen(input), args, MAX_PARAMS_PER_ENTRY, ',');
+	if (args_nr <= 0)
+		goto out;
+
+	entry->cur = entry->first = (uint32_t)atoi(args[0]);
+	entry->last = args_nr > 1 ? (uint32_t)atoi(args[1]) : 0;
+	entry->incr = args_nr > 2 ? (uint32_t)atoi(args[2]) : 0;
+
+	if (args_nr > 3) {
+		if (!strcmp(args[3], "MUL"))
+			entry->op = OP_MUL;
+		else if (!strcmp(args[3], "ADD"))
+			entry->op = OP_ADD;
+		else {
+			printf("Invalid op %s", args[3]);
+			args_nr = -1;
+		}
+	} else
+		entry->op = OP_NONE;
+out:
+	return args_nr;
+}
+
+static uint16_t
+load_configs(const char *path)
+{
+	struct rte_cfgfile *cfgfile;
+	int nb_sections, i;
+	struct test_configure *test_case;
+	char section_name[CFG_NAME_LEN];
+	const char *case_type;
+	const char *lcore_dma;
+	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	int args_nr, nb_vp;
+	bool is_dma;
+
+	printf("config file parsing...\n");
+	cfgfile = rte_cfgfile_load(path, 0);
+	if (!cfgfile) {
+		printf("Open configure file error.\n");
+		exit(1);
+	}
+
+	nb_sections = rte_cfgfile_num_sections(cfgfile, NULL, 0);
+	if (nb_sections > MAX_TEST_CASES) {
+		printf("Error: The maximum number of cases is %d.\n", MAX_TEST_CASES);
+		exit(1);
+	}
+
+	for (i = 0; i < nb_sections; i++) {
+		snprintf(section_name, CFG_NAME_LEN, "case%d", i + 1);
+		test_case = &test_cases[i];
+		case_type = rte_cfgfile_get_entry(cfgfile, section_name, "type");
+		if (!case_type) {
+			printf("Error: No case type in case %d, the test will be finished here.\n",
+				i + 1);
+			test_case->is_valid = false;
+			continue;
+		}
+
+		if (strcmp(case_type, DMA_MEM_COPY) == 0) {
+			test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
+			test_case->test_type_str = DMA_MEM_COPY;
+			is_dma = true;
+		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
+			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
+			test_case->test_type_str = CPU_MEM_COPY;
+			is_dma = false;
+		} else {
+			printf("Error: Cannot find case type %s in case%d.\n", case_type, i + 1);
+			test_case->is_valid = false;
+			continue;
+		}
+
+		nb_vp = 0;
+
+		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "src_numa_node"));
+		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "dst_numa_node"));
+
+		mem_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "mem_size");
+		args_nr = parse_entry(mem_size_str, &test_case->mem_size);
+		if (args_nr < 0) {
+			printf("parse error in case %d.\n", i + 1);
+			test_case->is_valid = false;
+			continue;
+		} else if (args_nr > 1)
+			nb_vp++;
+
+		buf_size_str = rte_cfgfile_get_entry(cfgfile, section_name, "buf_size");
+		args_nr = parse_entry(buf_size_str, &test_case->buf_size);
+		if (args_nr < 0) {
+			printf("parse error in case %d.\n", i + 1);
+			test_case->is_valid = false;
+			continue;
+		} else if (args_nr > 1)
+			nb_vp++;
+
+		if (is_dma) {
+			ring_size_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_ring_size");
+			args_nr = parse_entry(ring_size_str, &test_case->ring_size);
+			if (args_nr < 0) {
+				printf("parse error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			} else if (args_nr > 1)
+				nb_vp++;
+
+			kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name, "kick_batch");
+			args_nr = parse_entry(kick_batch_str, &test_case->kick_batch);
+			if (args_nr < 0) {
+				printf("parse error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			} else if (args_nr > 1)
+				nb_vp++;
+
+			lcore_dma = rte_cfgfile_get_entry(cfgfile, section_name, "lcore_dma");
+			int lcore_ret = parse_lcore_dma(test_case, lcore_dma);
+			if (lcore_ret < 0) {
+				printf("parse lcore dma error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+		} else {
+			lcore_dma = rte_cfgfile_get_entry(cfgfile, section_name, "lcore");
+			int lcore_ret = parse_lcore(test_case, lcore_dma);
+			if (lcore_ret < 0) {
+				printf("parse lcore error in case %d.\n", i + 1);
+				test_case->is_valid = false;
+				continue;
+			}
+		}
+
+		if (nb_vp > 1) {
+			printf("Error, each section can only have a single variable parameter.\n");
+			test_case->is_valid = false;
+			continue;
+		}
+
+		test_case->cache_flush =
+			(int)atoi(rte_cfgfile_get_entry(cfgfile, section_name, "cache_flush"));
+		test_case->test_secs = (uint16_t)atoi(rte_cfgfile_get_entry(cfgfile,
+					section_name, "test_seconds"));
+
+		test_case->eal_args = rte_cfgfile_get_entry(cfgfile, section_name, "eal_args");
+		test_case->is_valid = true;
+	}
+
+	rte_cfgfile_close(cfgfile);
+	printf("config file parsing complete.\n\n");
+	return i;
+}
+
+/* Parse the argument given in the command line of the application */
+static int
+append_eal_args(int argc, char **argv, const char *eal_args, char **new_argv)
+{
+	int i;
+	char *tokens[MAX_EAL_PARAM_NB];
+	char args[MAX_EAL_PARAM_LEN] = {0};
+	int token_nb, new_argc = 0;
+
+	for (i = 0; i < argc; i++) {
+		if ((strcmp(argv[i], CMDLINE_CONFIG_ARG) == 0) ||
+				(strcmp(argv[i], CMDLINE_RESULT_ARG) == 0)) {
+			i++;
+			continue;
+		}
+		strlcpy(new_argv[new_argc], argv[i], sizeof(new_argv[new_argc]));
+		new_argc++;
+	}
+
+	if (eal_args) {
+		strlcpy(args, eal_args, sizeof(args));
+		token_nb = rte_strsplit(args, strlen(args),
+					tokens, MAX_EAL_PARAM_NB, ' ');
+		for (i = 0; i < token_nb; i++)
+			strcpy(new_argv[new_argc++], tokens[i]);
+	}
+
+	return new_argc;
+}
+
+int
+main(int argc, char *argv[])
+{
+	int ret;
+	uint16_t case_nb;
+	uint32_t i, nb_lcores;
+	pid_t cpid, wpid;
+	int wstatus;
+	char args[MAX_EAL_PARAM_NB][MAX_EAL_PARAM_LEN];
+	char *pargs[MAX_EAL_PARAM_NB];
+	char *cfg_path_ptr = NULL;
+	char *rst_path_ptr = NULL;
+	char rst_path[PATH_MAX];
+	int new_argc;
+	bool is_first_case = true;
+
+	memset(args, 0, sizeof(args));
+
+	for (i = 0; i < RTE_DIM(pargs); i++)
+		pargs[i] = args[i];
+
+	for (i = 0; i < (uint32_t)argc; i++) {
+		if (strncmp(argv[i], CMDLINE_CONFIG_ARG, MAX_LONG_OPT_SZ) == 0)
+			cfg_path_ptr = argv[i + 1];
+		if (strncmp(argv[i], CMDLINE_RESULT_ARG, MAX_LONG_OPT_SZ) == 0)
+			rst_path_ptr = argv[i + 1];
+	}
+	if (cfg_path_ptr == NULL) {
+		printf("Config file not assigned.\n");
+		return -1;
+	}
+	if (rst_path_ptr == NULL) {
+		strlcpy(rst_path, cfg_path_ptr, PATH_MAX);
+		strcat(strtok(basename(rst_path), "."), "_result.csv");
+		rst_path_ptr = rst_path;
+	}
+
+	case_nb = load_configs(cfg_path_ptr);
+	fd = fopen(rst_path_ptr, "w");
+	if (fd == NULL) {
+		printf("Open output CSV file error.\n");
+		return -1;
+	}
+	fclose(fd);
+
+	for (i = 0; i < case_nb; i++) {
+		if (test_cases[i].test_type == TEST_TYPE_NONE) {
+			printf("No test type in test case %d.\n\n", i + 1);
+			continue;
+		}
+		if (!test_cases[i].is_valid) {
+			printf("Invalid test case %d.\n\n", i + 1);
+			continue;
+		}
+
+		cpid = fork();
+		if (cpid < 0) {
+			printf("Fork case %d failed.\n", i + 1);
+			exit(EXIT_FAILURE);
+		} else if (cpid == 0) {
+			printf("\nRunning case %u\n\n", i + 1);
+
+			new_argc = append_eal_args(argc, argv, test_cases[i].eal_args, pargs);
+			ret = rte_eal_init(new_argc, pargs);
+			if (ret < 0)
+				rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
+
+			/* Check lcores. */
+			nb_lcores = rte_lcore_count();
+			if (nb_lcores < 2)
+				rte_exit(EXIT_FAILURE,
+					"There should be at least 2 worker lcores.\n");
+
+			fd = fopen(rst_path_ptr, "a");
+			if (!fd) {
+				printf("Open output CSV file error.\n");
+				return 0;
+			}
+
+			if (is_first_case) {
+				output_env_info();
+				is_first_case = false;
+			}
+			run_test(i + 1, &test_cases[i]);
+
+			/* clean up the EAL */
+			rte_eal_cleanup();
+
+			fclose(fd);
+
+			printf("\nCase %u completed.\n\n", i + 1);
+
+			exit(EXIT_SUCCESS);
+		} else {
+			wpid = waitpid(cpid, &wstatus, 0);
+			if (wpid == -1) {
+				printf("waitpid error.\n");
+				exit(EXIT_FAILURE);
+			}
+
+			if (WIFEXITED(wstatus))
+				printf("Case process exited. status %d\n\n",
+					WEXITSTATUS(wstatus));
+			else if (WIFSIGNALED(wstatus))
+				printf("Case process killed by signal %d\n\n",
+					WTERMSIG(wstatus));
+			else if (WIFSTOPPED(wstatus))
+				printf("Case process stopped by signal %d\n\n",
+					WSTOPSIG(wstatus));
+			else if (WIFCONTINUED(wstatus))
+				printf("Case process continued.\n\n");
+			else
+				printf("Case process unknown terminated.\n\n");
+		}
+	}
+
+	printf("Bye...\n");
+	return 0;
+}
+
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
new file mode 100644
index 0000000000..215ac42673
--- /dev/null
+++ b/app/test-dma-perf/main.h
@@ -0,0 +1,69 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#ifndef _MAIN_H_
+#define _MAIN_H_
+
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_dev.h>
+#include <rte_dmadev.h>
+
+#ifndef __maybe_unused
+#define __maybe_unused	__rte_unused
+#endif
+
+#define MAX_WORKER_NB 128
+#define MAX_OUTPUT_STR_LEN 512
+
+#define MAX_DMA_NB 128
+#define MAX_LCORE_NB 256
+
+extern char output_str[MAX_WORKER_NB][MAX_OUTPUT_STR_LEN];
+
+typedef enum {
+	OP_NONE = 0,
+	OP_ADD,
+	OP_MUL
+} alg_op_type;
+
+struct test_configure_entry {
+	uint32_t first;
+	uint32_t last;
+	uint32_t incr;
+	alg_op_type op;
+	uint32_t cur;
+};
+
+struct lcore_dma_map_t {
+	uint32_t lcores[MAX_WORKER_NB];
+	char dma_names[MAX_WORKER_NB][RTE_DEV_NAME_MAX_LEN];
+	int16_t dma_ids[MAX_WORKER_NB];
+	uint16_t cnt;
+};
+
+struct test_configure {
+	bool is_valid;
+	uint8_t test_type;
+	const char *test_type_str;
+	uint16_t src_numa_node;
+	uint16_t dst_numa_node;
+	uint16_t opcode;
+	bool is_dma;
+	struct lcore_dma_map_t lcore_dma_map;
+	struct test_configure_entry mem_size;
+	struct test_configure_entry buf_size;
+	struct test_configure_entry ring_size;
+	struct test_configure_entry kick_batch;
+	uint32_t cache_flush;
+	uint32_t nr_buf;
+	uint16_t test_secs;
+	const char *eal_args;
+	uint8_t scenario_id;
+};
+
+void mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+
+#endif /* _MAIN_H_ */
diff --git a/app/test-dma-perf/meson.build b/app/test-dma-perf/meson.build
new file mode 100644
index 0000000000..bd6c264002
--- /dev/null
+++ b/app/test-dma-perf/meson.build
@@ -0,0 +1,17 @@ 
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2019-2023 Intel Corporation
+
+# meson file, for building this app as part of a main DPDK build.
+
+if is_windows
+    build = false
+    reason = 'not supported on Windows'
+    subdir_done()
+endif
+
+deps += ['dmadev', 'mbuf', 'cfgfile']
+
+sources = files(
+        'main.c',
+        'benchmark.c',
+)