[v10,4/4] app/dma-perf: add SG copy support

Message ID 20240227185631.3932799-1-amitprakashs@marvell.com (mailing list archive)
State Superseded, archived
Headers
Series None |

Commit Message

Amit Prakash Shukla Feb. 27, 2024, 6:56 p.m. UTC
  From: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>

Add SG copy support.

Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
Acked-by: Anoob Joseph <anoobj@marvell.com>
Acked-by: Chengwen Feng <fengchengwen@huawei.com>
---
v10:
- SG config variables renamed.

 app/test-dma-perf/benchmark.c | 278 +++++++++++++++++++++++++++++-----
 app/test-dma-perf/config.ini  |  25 ++-
 app/test-dma-perf/main.c      |  34 ++++-
 app/test-dma-perf/main.h      |   5 +-
 4 files changed, 300 insertions(+), 42 deletions(-)
  

Comments

Chengwen Feng Feb. 28, 2024, 9:31 a.m. UTC | #1
Hi Gowrishankar,

On 2024/2/28 2:56, Amit Prakash Shukla wrote:
> From: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
> 
> Add SG copy support.
> 
> Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
> Acked-by: Anoob Joseph <anoobj@marvell.com>
> Acked-by: Chengwen Feng <fengchengwen@huawei.com>
> ---
> v10:
> - SG config variables renamed.
> 
>  app/test-dma-perf/benchmark.c | 278 +++++++++++++++++++++++++++++-----
>  app/test-dma-perf/config.ini  |  25 ++-
>  app/test-dma-perf/main.c      |  34 ++++-
>  app/test-dma-perf/main.h      |   5 +-
>  4 files changed, 300 insertions(+), 42 deletions(-)
> 
> diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
> index 0047e2f4b8..25ed6fa6d0 100644
> --- a/app/test-dma-perf/benchmark.c
> +++ b/app/test-dma-perf/benchmark.c
> @@ -46,6 +46,10 @@ struct lcore_params {
>  	uint16_t test_secs;
>  	struct rte_mbuf **srcs;
>  	struct rte_mbuf **dsts;
> +	struct rte_dma_sge *src_sges;
> +	struct rte_dma_sge *dst_sges;
> +	uint8_t src_ptrs;
> +	uint8_t dst_ptrs;

1. src/dst_ptrs -> src/dst_nb_sge
2. How about wrap these four fields as a struct?

>  	volatile struct worker_info worker_info;
>  };
>  
> @@ -86,21 +90,31 @@ calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t te
>  }
>  
>  static void
> -output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t ring_size,
> -			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size, uint32_t nr_buf,
> -			float memory, float bandwidth, float mops, bool is_dma)
> +output_result(struct test_configure *cfg, struct lcore_params *para,
> +			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size,
> +			uint32_t nr_buf, float memory, float bandwidth, float mops)
>  {
> -	if (is_dma)
> -		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u.\n",
> -				lcore_id, dma_name, ring_size, kick_batch);
> -	else
> +	uint16_t ring_size = cfg->ring_size.cur;
> +	uint8_t scenario_id = cfg->scenario_id;
> +	uint32_t lcore_id = para->lcore_id;
> +	char *dma_name = para->dma_name;
> +
> +	if (cfg->is_dma) {
> +		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u", lcore_id,
> +		       dma_name, ring_size, kick_batch);
> +		if (cfg->is_sg)
> +			printf(" DMA src ptrs: %u, dst ptrs: %u",
> +			       para->src_ptrs, para->dst_ptrs);

DMA src sges: %u DMA dst sges: %u

I think we should add a column which title maybe misc, some like sg-src[4]-dst[1],
and later we may add fill test, then this field could be pattern-0x12345678

And in "[PATCH v10 2/4] app/dma-perf: add PCI device support" commit, if the DMA was
worked in non-mem2mem direction, we could add simple descriptor of direction and pcie.info
in the above misc column.

> +		printf(".\n");
> +	} else {
>  		printf("lcore %u\n", lcore_id);
> +	}
>  
>  	printf("Average Cycles/op: %" PRIu64 ", Buffer Size: %u B, Buffer Number: %u, Memory: %.2lf MB, Frequency: %.3lf Ghz.\n",
>  			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz()/1000000000.0);
>  	printf("Average Bandwidth: %.3lf Gbps, MOps: %.3lf\n", bandwidth, mops);
>  
> -	if (is_dma)
> +	if (cfg->is_dma)
>  		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
>  			scenario_id, lcore_id, dma_name, ring_size, kick_batch, buf_size,
>  			nr_buf, memory, ave_cycle, bandwidth, mops);
> @@ -167,7 +181,7 @@ vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
>  
>  /* Configuration of device. */
>  static void
> -configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
> +configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg, uint8_t ptrs_max)
>  {
>  	uint16_t vchan = 0;
>  	struct rte_dma_info info;
> @@ -190,6 +204,10 @@ configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
>  		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id. %u\n",
>  				dev_id);
>  
> +	if (info.max_sges < ptrs_max)
> +		rte_exit(EXIT_FAILURE, "Error, DMA ptrs more than supported by device id %u.\n",

"Error with unsupport max_sges on device id %u.\n"

> +				dev_id);
> +
>  	if (rte_dma_start(dev_id) != 0)
>  		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
>  }
> @@ -202,8 +220,12 @@ config_dmadevs(struct test_configure *cfg)
>  	uint32_t i;
>  	int dev_id;
>  	uint16_t nb_dmadevs = 0;
> +	uint8_t ptrs_max = 0;

It hard to understand, how about nb_sge?

>  	char *dma_name;
>  
> +	if (cfg->is_sg)
> +		ptrs_max = RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs);
> +
>  	for (i = 0; i < ldm->cnt; i++) {
>  		dma_name = ldm->dma_names[i];
>  		dev_id = rte_dma_get_dev_id_by_name(dma_name);
> @@ -213,7 +235,7 @@ config_dmadevs(struct test_configure *cfg)
>  		}
>  
>  		ldm->dma_ids[i] = dev_id;
> -		configure_dmadev_queue(dev_id, cfg);
> +		configure_dmadev_queue(dev_id, cfg, ptrs_max);
>  		++nb_dmadevs;
>  	}
>  
> @@ -253,7 +275,7 @@ do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
>  }
>  
>  static inline int
> -do_dma_mem_copy(void *p)
> +do_dma_plain_mem_copy(void *p)
>  {
>  	struct lcore_params *para = (struct lcore_params *)p;
>  	volatile struct worker_info *worker_info = &(para->worker_info);
> @@ -306,6 +328,65 @@ do_dma_mem_copy(void *p)
>  	return 0;
>  }
>  
> +static inline int
> +do_dma_sg_mem_copy(void *p)
> +{
> +	struct lcore_params *para = (struct lcore_params *)p;
> +	volatile struct worker_info *worker_info = &(para->worker_info);
> +	struct rte_dma_sge *src_sges = para->src_sges;
> +	struct rte_dma_sge *dst_sges = para->dst_sges;
> +	const uint16_t kick_batch = para->kick_batch;
> +	const uint8_t src_ptrs = para->src_ptrs;
> +	const uint8_t dst_ptrs = para->dst_ptrs;
> +	const uint16_t dev_id = para->dev_id;
> +	uint32_t nr_buf = para->nr_buf;
> +	uint64_t async_cnt = 0;
> +	uint32_t poll_cnt = 0;
> +	uint16_t nr_cpl;
> +	uint32_t i, j;
> +	int ret;
> +
> +	nr_buf /= RTE_MAX(src_ptrs, dst_ptrs);
> +	worker_info->stop_flag = false;
> +	worker_info->ready_flag = true;
> +
> +	while (!worker_info->start_flag)
> +		;
> +
> +	while (1) {
> +		j = 0;
> +		for (i = 0; i < nr_buf; i++) {
> +dma_copy:
> +			ret = rte_dma_copy_sg(dev_id, 0,
> +				&src_sges[i * src_ptrs], &dst_sges[j * dst_ptrs],
> +				src_ptrs, dst_ptrs, 0);
> +			if (unlikely(ret < 0)) {
> +				if (ret == -ENOSPC) {
> +					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
> +					goto dma_copy;
> +				} else
> +					error_exit(dev_id);
> +			}
> +			async_cnt++;
> +			j++;
> +
> +			if ((async_cnt % kick_batch) == 0)
> +				do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
> +		}
> +
> +		if (worker_info->stop_flag)
> +			break;
> +	}
> +
> +	rte_dma_submit(dev_id, 0);
> +	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
> +		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
> +		async_cnt -= nr_cpl;
> +	}
> +
> +	return 0;
> +}
> +
>  static inline int
>  do_cpu_mem_copy(void *p)
>  {
> @@ -347,8 +428,9 @@ dummy_free_ext_buf(void *addr, void *opaque)
>  }
>  
>  static int
> -setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
> -			struct rte_mbuf ***dsts)
> +setup_memory_env(struct test_configure *cfg,
> +			 struct rte_mbuf ***srcs, struct rte_mbuf ***dsts,
> +			 struct rte_dma_sge **src_sges, struct rte_dma_sge **dst_sges)
>  {
>  	static struct rte_mbuf_ext_shared_info *ext_buf_info;
>  	unsigned int cur_buf_size = cfg->buf_size.cur;
> @@ -409,8 +491,8 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
>  	}
>  
>  	for (i = 0; i < nr_buf; i++) {
> -		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(), buf_size);
> -		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, buf_size);
> +		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(), cur_buf_size);
> +		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, cur_buf_size);
>  	}
>  
>  	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
> @@ -446,20 +528,56 @@ setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
>  		}
>  	}
>  
> +	if (cfg->is_sg) {
> +		uint8_t src_ptrs = cfg->src_ptrs;
> +		uint8_t dst_ptrs = cfg->dst_ptrs;
> +		uint32_t sglen_src, sglen_dst;
> +
> +		*src_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_dma_sge),
> +					RTE_CACHE_LINE_SIZE);
> +		if (*src_sges == NULL) {
> +			printf("Error: src_sges array malloc failed.\n");
> +			return -1;
> +		}
> +
> +		*dst_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_dma_sge),
> +					RTE_CACHE_LINE_SIZE);
> +		if (*dst_sges == NULL) {
> +			printf("Error: dst_sges array malloc failed.\n");
> +			return -1;
> +		}
> +
> +		sglen_src = cur_buf_size / src_ptrs;
> +		sglen_dst = cur_buf_size / dst_ptrs;
> +
> +		for (i = 0; i < nr_buf; i++) {
> +			(*src_sges)[i].addr = rte_pktmbuf_iova((*srcs)[i]);
> +			(*src_sges)[i].length = sglen_src;
> +			if (!((i+1) % src_ptrs))
> +				(*src_sges)[i].length += (cur_buf_size % src_ptrs);
> +
> +			(*dst_sges)[i].addr = rte_pktmbuf_iova((*dsts)[i]);
> +			(*dst_sges)[i].length = sglen_dst;
> +			if (!((i+1) % dst_ptrs))
> +				(*dst_sges)[i].length += (cur_buf_size % dst_ptrs);
> +		}
> +	}
> +
>  	return 0;
>  }
>  
>  int
> -mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
> +mem_copy_benchmark(struct test_configure *cfg)
>  {
> -	uint32_t i;
> +	uint32_t i, j;
>  	uint32_t offset;
>  	unsigned int lcore_id = 0;
>  	struct rte_mbuf **srcs = NULL, **dsts = NULL, **m = NULL;
> +	struct rte_dma_sge *src_sges = NULL, *dst_sges = NULL;
>  	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> +	const uint32_t mcore_id = rte_get_main_lcore();
>  	unsigned int buf_size = cfg->buf_size.cur;
>  	uint16_t kick_batch = cfg->kick_batch.cur;
> -	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
>  	uint16_t nb_workers = ldm->cnt;
>  	uint16_t test_secs = cfg->test_secs;
>  	float memory = 0;
> @@ -467,12 +585,32 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  	uint32_t avg_cycles_total;
>  	float mops, mops_total;
>  	float bandwidth, bandwidth_total;
> +	uint32_t nr_sgsrc = 0, nr_sgdst = 0;
> +	uint32_t nr_buf;
>  	int ret = 0;
>  
> -	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> +	/* Align number of buffers according to workers count */
> +	nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
> +	nr_buf -= (nr_buf % nb_workers);
> +	if (cfg->is_sg) {
> +		nr_buf /= nb_workers;
> +		nr_buf -= nr_buf % (cfg->src_ptrs * cfg->dst_ptrs);
> +		nr_buf *= nb_workers;
> +
> +		if (cfg->dst_ptrs > cfg->src_ptrs) {
> +			nr_sgsrc = (nr_buf / cfg->dst_ptrs * cfg->src_ptrs);
> +			nr_sgdst = nr_buf;
> +		} else {
> +			nr_sgsrc = nr_buf;
> +			nr_sgdst = (nr_buf / cfg->src_ptrs * cfg->dst_ptrs);
> +		}
> +	}

pls move to a subfunction

> +
> +	cfg->nr_buf = nr_buf;
> +	if (setup_memory_env(cfg, &srcs, &dsts, &src_sges, &dst_sges) < 0)
>  		goto out;
>  
> -	if (is_dma)
> +	if (cfg->is_dma)
>  		if (config_dmadevs(cfg) < 0)
>  			goto out;
>  
> @@ -486,13 +624,23 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  
>  	for (i = 0; i < nb_workers; i++) {
>  		lcore_id = ldm->lcores[i];
> +		if (lcore_id == mcore_id) {
> +			printf("lcore parameters can not use main core id %d\n", mcore_id);
> +			goto out;
> +		}
> +
> +		if (rte_eal_lcore_role(lcore_id) == ROLE_OFF) {
> +			printf("lcore parameters can not use offline core id %d\n", lcore_id);
> +			goto out;
> +		}

The above two judgement should in a seperate commit.

> +
>  		offset = nr_buf / nb_workers * i;
>  		lcores[i] = rte_malloc(NULL, sizeof(struct lcore_params), 0);
>  		if (lcores[i] == NULL) {
>  			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
>  			break;
>  		}
> -		if (is_dma) {
> +		if (cfg->is_dma) {
>  			lcores[i]->dma_name = ldm->dma_names[i];
>  			lcores[i]->dev_id = ldm->dma_ids[i];
>  			lcores[i]->kick_batch = kick_batch;
> @@ -506,10 +654,23 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  		lcores[i]->scenario_id = cfg->scenario_id;
>  		lcores[i]->lcore_id = lcore_id;
>  
> -		if (is_dma)
> -			rte_eal_remote_launch(do_dma_mem_copy, (void *)(lcores[i]), lcore_id);
> -		else
> +		if (cfg->is_sg) {
> +			lcores[i]->src_ptrs = cfg->src_ptrs;
> +			lcores[i]->dst_ptrs = cfg->dst_ptrs;
> +			lcores[i]->src_sges = src_sges + (nr_sgsrc / nb_workers * i);
> +			lcores[i]->dst_sges = dst_sges + (nr_sgdst / nb_workers * i);
> +		}
> +
> +		if (cfg->is_dma) {
> +			if (!cfg->is_sg)
> +				rte_eal_remote_launch(do_dma_plain_mem_copy, (void *)(lcores[i]),
> +					lcore_id);
> +			else
> +				rte_eal_remote_launch(do_dma_sg_mem_copy, (void *)(lcores[i]),
> +					lcore_id);
> +		} else {
>  			rte_eal_remote_launch(do_cpu_mem_copy, (void *)(lcores[i]), lcore_id);
> +		}

too many judgement for selecting target function, how about wrap it subfunction:
lcore_function_t get_work_function(struct test_configure *cfg)
then rte_eal_remote_launch(get_work_function(cfg), (void *)(lcores[i]), lcore_id);

>  	}
>  
>  	while (1) {
> @@ -541,13 +702,53 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  
>  	rte_eal_mp_wait_lcore();
>  
> -	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
> -		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
> -			   rte_pktmbuf_mtod(dsts[i], void *),
> -			   cfg->buf_size.cur) != 0) {
> -			printf("Copy validation fails for buffer number %d\n", i);
> -			ret = -1;
> -			goto out;
> +	if (!cfg->is_sg && cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM) {
> +		for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
> +			if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
> +					rte_pktmbuf_mtod(dsts[i], void *),
> +					cfg->buf_size.cur) != 0) {
> +				printf("Copy validation fails for buffer number %d\n", i);
> +				ret = -1;
> +				goto out;
> +			}
> +		}
> +	} else if (cfg->is_sg && cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM) {
> +		size_t src_remsz = buf_size % cfg->src_ptrs;
> +		size_t dst_remsz = buf_size % cfg->dst_ptrs;
> +		size_t src_sz = buf_size / cfg->src_ptrs;
> +		size_t dst_sz = buf_size / cfg->dst_ptrs;
> +		uint8_t src[buf_size], dst[buf_size];
> +		uint8_t *sbuf, *dbuf, *ptr;
> +
> +		for (i = 0; i < (nr_buf / RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs)); i++) {
> +			sbuf = src;
> +			dbuf = dst;
> +			ptr = NULL;
> +
> +			for (j = 0; j < cfg->src_ptrs; j++) {
> +				ptr = rte_pktmbuf_mtod(srcs[i * cfg->src_ptrs + j], uint8_t *);
> +				memcpy(sbuf, ptr, src_sz);
> +				sbuf += src_sz;
> +			}
> +
> +			if (src_remsz)
> +				memcpy(sbuf, ptr + src_sz, src_remsz);
> +
> +			for (j = 0; j < cfg->dst_ptrs; j++) {
> +				ptr = rte_pktmbuf_mtod(dsts[i * cfg->dst_ptrs + j], uint8_t *);
> +				memcpy(dbuf, ptr, dst_sz);
> +				dbuf += dst_sz;
> +			}
> +
> +			if (dst_remsz)
> +				memcpy(dbuf, ptr + dst_sz, dst_remsz);
> +
> +			if (memcmp(src, dst, buf_size) != 0) {
> +				printf("SG Copy validation fails for buffer number %d\n",
> +					i * cfg->src_ptrs);
> +				ret = -1;
> +				goto out;
> +			}

Now I doubt the value of verify, this verify can't find the middle round copy failure,
because as long as the last round copy is successful, the validation will pass.

And adding validatation in every round copy will impact performance.

Also app/test_dmadev already verify data. so I think we should drop the validation commit.

>  		}
>  	}
>  
> @@ -558,10 +759,8 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  		calc_result(buf_size, nr_buf, nb_workers, test_secs,
>  			lcores[i]->worker_info.test_cpl,
>  			&memory, &avg_cycles, &bandwidth, &mops);
> -		output_result(cfg->scenario_id, lcores[i]->lcore_id,
> -					lcores[i]->dma_name, cfg->ring_size.cur, kick_batch,
> -					avg_cycles, buf_size, nr_buf / nb_workers, memory,
> -					bandwidth, mops, is_dma);
> +		output_result(cfg, lcores[i], kick_batch, avg_cycles, buf_size,
> +			nr_buf / nb_workers, memory, bandwidth, mops);
>  		mops_total += mops;
>  		bandwidth_total += bandwidth;
>  		avg_cycles_total += avg_cycles;
> @@ -604,13 +803,20 @@ mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>  	rte_mempool_free(dst_pool);
>  	dst_pool = NULL;
>  
> +	/* free sges for mbufs */
> +	rte_free(src_sges);
> +	src_sges = NULL;
> +
> +	rte_free(dst_sges);
> +	dst_sges = NULL;
> +
>  	/* free the worker parameters */
>  	for (i = 0; i < nb_workers; i++) {
>  		rte_free(lcores[i]);
>  		lcores[i] = NULL;
>  	}
>  
> -	if (is_dma) {
> +	if (cfg->is_dma) {
>  		for (i = 0; i < nb_workers; i++) {
>  			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
>  			rte_dma_stop(ldm->dma_ids[i]);
> diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
> index 9c8221025e..28f6c9d1db 100644
> --- a/app/test-dma-perf/config.ini
> +++ b/app/test-dma-perf/config.ini
> @@ -38,6 +38,14 @@
>  
>  ; "skip" To skip a test-case set skip to 1.

Please place hese patchset new add entrys' descriptions above the
"; To specify a configuration file, use the "--config" flag followed by the path to the file."

because original config.ini, fist is parameters descriptor, and then program argment descriptor, and last was example.

>  
> +; Parameters to be configured for SG copy:

Parameters for DMA scatter-gather memory copy:

> +; ========================================

Please remove this line

> +; "dma_src_sge" denotes number of source segments.
> +; "dma_dst_sge" denotes number of destination segments.
> +;
> +; For SG copy, both the parameters need to be configured and they are valid only
> +; when type is DMA_MEM_COPY.

For DMA scatter-gather memory copy, the parameters need to be configured and they are valid only
when type is DMA_MEM_COPY.

> +;
>  ; Parameters to be configured for data transfers from "mem to dev" and "dev to mem":
>  ; ==================================================================================

Please remove this line

As another commit "Re: [PATCH v2] app/dma-perf: support bi-directional transfer"'s review feedback,
these descriptor should place after
"
; To use DMA for a test, please specify the "lcore_dma" parameter.
; If you have already set the "-l" and "-a" parameters using EAL,
; make sure that the value of "lcore_dma" falls within their range of the values.
; We have to ensure a 1:1 mapping between the core and DMA device.
"


>  ; "direction" denotes the direction of data transfer. It can take 3 values:
> @@ -69,6 +77,21 @@ lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
>  eal_args=--in-memory --file-prefix=test
>  
>  [case2]
> +type=DMA_MEM_COPY
> +mem_size=10
> +buf_size=64,8192,2,MUL
> +dma_ring_size=1024
> +dma_src_sge=4
> +dma_dst_sge=1
> +kick_batch=32
> +src_numa_node=0
> +dst_numa_node=0
> +cache_flush=0
> +test_seconds=2
> +lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
> +eal_args=--in-memory --file-prefix=test
> +
> +[case3]
>  skip=1
>  type=DMA_MEM_COPY
>  direction=dev2mem
> @@ -84,7 +107,7 @@ test_seconds=2
>  lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
>  eal_args=--in-memory --file-prefix=test
>  
> -[case3]
> +[case4]
>  type=CPU_MEM_COPY
>  mem_size=10
>  buf_size=64,8192,2,MUL
> diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
> index df05bcd7df..a27e4c9429 100644
> --- a/app/test-dma-perf/main.c
> +++ b/app/test-dma-perf/main.c
> @@ -108,10 +108,8 @@ run_test_case(struct test_configure *case_cfg)
>  
>  	switch (case_cfg->test_type) {
>  	case TEST_TYPE_DMA_MEM_COPY:
> -		ret = mem_copy_benchmark(case_cfg, true);
> -		break;
>  	case TEST_TYPE_CPU_MEM_COPY:
> -		ret = mem_copy_benchmark(case_cfg, false);
> +		ret = mem_copy_benchmark(case_cfg);
>  		break;
>  	default:
>  		printf("Unknown test type. %s\n", case_cfg->test_type_str);
> @@ -365,7 +363,8 @@ load_configs(const char *path)
>  	const char *case_type;
>  	const char *transfer_dir;
>  	const char *lcore_dma;
> -	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
> +	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str,
> +		*src_ptrs_str, *dst_ptrs_str;
>  	const char *skip;
>  	struct rte_kvargs *kvlist;
>  	const char *vchan_dev;
> @@ -467,6 +466,7 @@ load_configs(const char *path)
>  			rte_kvargs_free(kvlist);
>  		}
>  
> +		test_case->is_dma = is_dma;
>  		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
>  								section_name, "src_numa_node"));
>  		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
> @@ -501,6 +501,32 @@ load_configs(const char *path)
>  			} else if (args_nr == 4)
>  				nb_vp++;
>  
> +			src_ptrs_str = rte_cfgfile_get_entry(cfgfile, section_name,
> +								"dma_src_sge");
> +			if (src_ptrs_str != NULL) {
> +				test_case->src_ptrs = (int)atoi(rte_cfgfile_get_entry(cfgfile,
> +								section_name, "dma_src_sge"));
> +			}
> +
> +			dst_ptrs_str = rte_cfgfile_get_entry(cfgfile, section_name,
> +								"dma_dst_sge");
> +			if (dst_ptrs_str != NULL) {
> +				test_case->dst_ptrs = (int)atoi(rte_cfgfile_get_entry(cfgfile,
> +								section_name, "dma_dst_sge"));
> +			}
> +
> +			if ((src_ptrs_str != NULL && dst_ptrs_str == NULL) ||
> +			    (src_ptrs_str == NULL && dst_ptrs_str != NULL)) {

Please also check test_case->src_ptrs and test_case->dst_ptrs valid, make sure there are >1 and <=UINT16_MAX

> +				printf("parse dma_src_sge, dma_dst_sge error in case %d.\n",
> +					i + 1);
> +				test_case->is_valid = false;
> +				continue;
> +			} else if (src_ptrs_str != NULL && dst_ptrs_str != NULL) {
> +				test_case->is_sg = true;
> +			} else {
> +				test_case->is_sg = false;

the above could simple by: test_case->is_sg = (src_ptrs_str != NULL && dst_ptrs_str != NULL);

> +			}
> +
>  			kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name, "kick_batch");
>  			args_nr = parse_entry(kick_batch_str, &test_case->kick_batch);
>  			if (args_nr < 0) {
> diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
> index 1123e7524a..baf149b72b 100644
> --- a/app/test-dma-perf/main.h
> +++ b/app/test-dma-perf/main.h
> @@ -53,11 +53,14 @@ struct test_configure {
>  	uint16_t dst_numa_node;
>  	uint16_t opcode;
>  	bool is_dma;
> +	bool is_sg;
>  	struct lcore_dma_map_t lcore_dma_map;
>  	struct test_configure_entry mem_size;
>  	struct test_configure_entry buf_size;
>  	struct test_configure_entry ring_size;
>  	struct test_configure_entry kick_batch;
> +	uint8_t src_ptrs;
> +	uint8_t dst_ptrs;
>  	uint8_t cache_flush;
>  	uint32_t nr_buf;
>  	uint16_t test_secs;
> @@ -66,6 +69,6 @@ struct test_configure {
>  	struct test_vchan_dev_config vchan_dev;
>  };
>  
> -int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
> +int mem_copy_benchmark(struct test_configure *cfg);
>  
>  #endif /* MAIN_H */
>
  
Gowrishankar Muthukrishnan Feb. 29, 2024, 1:16 p.m. UTC | #2
Hi Fengcheng,

> -----Original Message-----
> From: fengchengwen <fengchengwen@huawei.com>
> Sent: Wednesday, February 28, 2024 3:02 PM
> To: Amit Prakash Shukla <amitprakashs@marvell.com>; Cheng Jiang
> <honest.jiang@foxmail.com>
> Cc: dev@dpdk.org; Jerin Jacob <jerinj@marvell.com>; Anoob Joseph
> <anoobj@marvell.com>; Kevin Laatz <kevin.laatz@intel.com>; Bruce
> Richardson <bruce.richardson@intel.com>; Pavan Nikhilesh Bhagavatula
> <pbhagavatula@marvell.com>; Gowrishankar Muthukrishnan
> <gmuthukrishn@marvell.com>
> Subject: [EXT] Re: [PATCH v10 4/4] app/dma-perf: add SG copy support
> 
> External Email
> 
> ----------------------------------------------------------------------
> Hi Gowrishankar,
> 
> On 2024/2/28 2:56, Amit Prakash Shukla wrote:
> > From: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
> >
> > Add SG copy support.
> >
> > Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
> > Acked-by: Anoob Joseph <anoobj@marvell.com>
> > Acked-by: Chengwen Feng <fengchengwen@huawei.com>
> > ---
> > v10:
> > - SG config variables renamed.
> >
> >  app/test-dma-perf/benchmark.c | 278
> > +++++++++++++++++++++++++++++-----
> >  app/test-dma-perf/config.ini  |  25 ++-
> >  app/test-dma-perf/main.c      |  34 ++++-
> >  app/test-dma-perf/main.h      |   5 +-
> >  4 files changed, 300 insertions(+), 42 deletions(-)
> >
> > diff --git a/app/test-dma-perf/benchmark.c
> > b/app/test-dma-perf/benchmark.c index 0047e2f4b8..25ed6fa6d0 100644
> > --- a/app/test-dma-perf/benchmark.c
> > +++ b/app/test-dma-perf/benchmark.c
> > @@ -46,6 +46,10 @@ struct lcore_params {
> >  	uint16_t test_secs;
> >  	struct rte_mbuf **srcs;
> >  	struct rte_mbuf **dsts;
> > +	struct rte_dma_sge *src_sges;
> > +	struct rte_dma_sge *dst_sges;
> > +	uint8_t src_ptrs;
> > +	uint8_t dst_ptrs;
> 
> 1. src/dst_ptrs -> src/dst_nb_sge
Ack.

> 2. How about wrap these four fields as a struct?
Ack.

> 
> >  	volatile struct worker_info worker_info;  };
> >
> > @@ -86,21 +90,31 @@ calc_result(uint32_t buf_size, uint32_t nr_buf,
> > uint16_t nb_workers, uint16_t te  }
> >
> >  static void
> > -output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name,
> uint16_t ring_size,
> > -			uint16_t kick_batch, uint64_t ave_cycle, uint32_t
> buf_size, uint32_t nr_buf,
> > -			float memory, float bandwidth, float mops, bool
> is_dma)
> > +output_result(struct test_configure *cfg, struct lcore_params *para,
> > +			uint16_t kick_batch, uint64_t ave_cycle, uint32_t
> buf_size,
> > +			uint32_t nr_buf, float memory, float bandwidth, float
> mops)
> >  {
> > -	if (is_dma)
> > -		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size:
> %u.\n",
> > -				lcore_id, dma_name, ring_size, kick_batch);
> > -	else
> > +	uint16_t ring_size = cfg->ring_size.cur;
> > +	uint8_t scenario_id = cfg->scenario_id;
> > +	uint32_t lcore_id = para->lcore_id;
> > +	char *dma_name = para->dma_name;
> > +
> > +	if (cfg->is_dma) {
> > +		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size:
> %u", lcore_id,
> > +		       dma_name, ring_size, kick_batch);
> > +		if (cfg->is_sg)
> > +			printf(" DMA src ptrs: %u, dst ptrs: %u",
> > +			       para->src_ptrs, para->dst_ptrs);
> 
> DMA src sges: %u DMA dst sges: %u
> 
> I think we should add a column which title maybe misc, some like sg-src[4]-
> dst[1], and later we may add fill test, then this field could be pattern-
> 0x12345678
> 
> And in "[PATCH v10 2/4] app/dma-perf: add PCI device support" commit, if
> the DMA was worked in non-mem2mem direction, we could add simple
> descriptor of direction and pcie.info in the above misc column.
> 

I am sorry, I could not understand complete picture here. Do you mean we 
reserve a column and use it as per test type.

For plain mem copy, nothing added.
For SG mem copy, instead of showing "DMA src sges: 1, dst sges: 4", print "sg-src[1]-dst[4]".
In future, when we add fill test in benchmark, this line instead be "pattern-0x12345678".

Is my understanding correct over here ?

> > +		printf(".\n");
> > +	} else {
> >  		printf("lcore %u\n", lcore_id);
> > +	}
> >
> >  	printf("Average Cycles/op: %" PRIu64 ", Buffer Size: %u B, Buffer
> Number: %u, Memory: %.2lf MB, Frequency: %.3lf Ghz.\n",
> >  			ave_cycle, buf_size, nr_buf, memory,
> rte_get_timer_hz()/1000000000.0);
> >  	printf("Average Bandwidth: %.3lf Gbps, MOps: %.3lf\n", bandwidth,
> > mops);
> >
> > -	if (is_dma)
> > +	if (cfg->is_dma)
> >  		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
> CSV_LINE_DMA_FMT,
> >  			scenario_id, lcore_id, dma_name, ring_size,
> kick_batch, buf_size,
> >  			nr_buf, memory, ave_cycle, bandwidth, mops); @@ -
> 167,7 +181,7 @@
> > vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
> >
> >  /* Configuration of device. */
> >  static void
> > -configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
> > +configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg,
> > +uint8_t ptrs_max)
> >  {
> >  	uint16_t vchan = 0;
> >  	struct rte_dma_info info;
> > @@ -190,6 +204,10 @@ configure_dmadev_queue(uint32_t dev_id, struct
> test_configure *cfg)
> >  		rte_exit(EXIT_FAILURE, "Error, no configured queues reported
> on device id. %u\n",
> >  				dev_id);
> >
> > +	if (info.max_sges < ptrs_max)
> > +		rte_exit(EXIT_FAILURE, "Error, DMA ptrs more than supported
> by
> > +device id %u.\n",
> 
> "Error with unsupport max_sges on device id %u.\n"
Ack.

> 
> > +				dev_id);
> > +
> >  	if (rte_dma_start(dev_id) != 0)
> >  		rte_exit(EXIT_FAILURE, "Error with dma start.\n");  } @@ -
> 202,8
> > +220,12 @@ config_dmadevs(struct test_configure *cfg)
> >  	uint32_t i;
> >  	int dev_id;
> >  	uint16_t nb_dmadevs = 0;
> > +	uint8_t ptrs_max = 0;
> 
> It hard to understand, how about nb_sge?

Ack. Renamed it to nb_sges.
> 
> >  	char *dma_name;
> >
> > +	if (cfg->is_sg)
> > +		ptrs_max = RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs);
> > +
> >  	for (i = 0; i < ldm->cnt; i++) {
> >  		dma_name = ldm->dma_names[i];
> >  		dev_id = rte_dma_get_dev_id_by_name(dma_name);
> > @@ -213,7 +235,7 @@ config_dmadevs(struct test_configure *cfg)
> >  		}
> >
> >  		ldm->dma_ids[i] = dev_id;
> > -		configure_dmadev_queue(dev_id, cfg);
> > +		configure_dmadev_queue(dev_id, cfg, ptrs_max);
> >  		++nb_dmadevs;
> >  	}
> >
> > @@ -253,7 +275,7 @@ do_dma_submit_and_poll(uint16_t dev_id,
> uint64_t *async_cnt,
> >  }
> >
> >  static inline int
> > -do_dma_mem_copy(void *p)
> > +do_dma_plain_mem_copy(void *p)
> >  {
> >  	struct lcore_params *para = (struct lcore_params *)p;
> >  	volatile struct worker_info *worker_info = &(para->worker_info);
> > @@ -306,6 +328,65 @@ do_dma_mem_copy(void *p)
> >  	return 0;
> >  }
> >
> > +static inline int
> > +do_dma_sg_mem_copy(void *p)
> > +{
> > +	struct lcore_params *para = (struct lcore_params *)p;
> > +	volatile struct worker_info *worker_info = &(para->worker_info);
> > +	struct rte_dma_sge *src_sges = para->src_sges;
> > +	struct rte_dma_sge *dst_sges = para->dst_sges;
> > +	const uint16_t kick_batch = para->kick_batch;
> > +	const uint8_t src_ptrs = para->src_ptrs;
> > +	const uint8_t dst_ptrs = para->dst_ptrs;
> > +	const uint16_t dev_id = para->dev_id;
> > +	uint32_t nr_buf = para->nr_buf;
> > +	uint64_t async_cnt = 0;
> > +	uint32_t poll_cnt = 0;
> > +	uint16_t nr_cpl;
> > +	uint32_t i, j;
> > +	int ret;
> > +
> > +	nr_buf /= RTE_MAX(src_ptrs, dst_ptrs);
> > +	worker_info->stop_flag = false;
> > +	worker_info->ready_flag = true;
> > +
> > +	while (!worker_info->start_flag)
> > +		;
> > +
> > +	while (1) {
> > +		j = 0;
> > +		for (i = 0; i < nr_buf; i++) {
> > +dma_copy:
> > +			ret = rte_dma_copy_sg(dev_id, 0,
> > +				&src_sges[i * src_ptrs], &dst_sges[j *
> dst_ptrs],
> > +				src_ptrs, dst_ptrs, 0);
> > +			if (unlikely(ret < 0)) {
> > +				if (ret == -ENOSPC) {
> > +					do_dma_submit_and_poll(dev_id,
> &async_cnt, worker_info);
> > +					goto dma_copy;
> > +				} else
> > +					error_exit(dev_id);
> > +			}
> > +			async_cnt++;
> > +			j++;
> > +
> > +			if ((async_cnt % kick_batch) == 0)
> > +				do_dma_submit_and_poll(dev_id,
> &async_cnt, worker_info);
> > +		}
> > +
> > +		if (worker_info->stop_flag)
> > +			break;
> > +	}
> > +
> > +	rte_dma_submit(dev_id, 0);
> > +	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
> > +		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB,
> NULL, NULL);
> > +		async_cnt -= nr_cpl;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> >  static inline int
> >  do_cpu_mem_copy(void *p)
> >  {
> > @@ -347,8 +428,9 @@ dummy_free_ext_buf(void *addr, void *opaque)
> >  }
> >
> >  static int
> > -setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
> > -			struct rte_mbuf ***dsts)
> > +setup_memory_env(struct test_configure *cfg,
> > +			 struct rte_mbuf ***srcs, struct rte_mbuf ***dsts,
> > +			 struct rte_dma_sge **src_sges, struct rte_dma_sge
> **dst_sges)
> >  {
> >  	static struct rte_mbuf_ext_shared_info *ext_buf_info;
> >  	unsigned int cur_buf_size = cfg->buf_size.cur;
> > @@ -409,8 +491,8 @@ setup_memory_env(struct test_configure *cfg,
> struct rte_mbuf ***srcs,
> >  	}
> >
> >  	for (i = 0; i < nr_buf; i++) {
> > -		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(),
> buf_size);
> > -		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, buf_size);
> > +		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(),
> cur_buf_size);
> > +		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0,
> cur_buf_size);
> >  	}
> >
> >  	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
> > @@ -446,20 +528,56 @@ setup_memory_env(struct test_configure *cfg,
> struct rte_mbuf ***srcs,
> >  		}
> >  	}
> >
> > +	if (cfg->is_sg) {
> > +		uint8_t src_ptrs = cfg->src_ptrs;
> > +		uint8_t dst_ptrs = cfg->dst_ptrs;
> > +		uint32_t sglen_src, sglen_dst;
> > +
> > +		*src_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct
> rte_dma_sge),
> > +					RTE_CACHE_LINE_SIZE);
> > +		if (*src_sges == NULL) {
> > +			printf("Error: src_sges array malloc failed.\n");
> > +			return -1;
> > +		}
> > +
> > +		*dst_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct
> rte_dma_sge),
> > +					RTE_CACHE_LINE_SIZE);
> > +		if (*dst_sges == NULL) {
> > +			printf("Error: dst_sges array malloc failed.\n");
> > +			return -1;
> > +		}
> > +
> > +		sglen_src = cur_buf_size / src_ptrs;
> > +		sglen_dst = cur_buf_size / dst_ptrs;
> > +
> > +		for (i = 0; i < nr_buf; i++) {
> > +			(*src_sges)[i].addr = rte_pktmbuf_iova((*srcs)[i]);
> > +			(*src_sges)[i].length = sglen_src;
> > +			if (!((i+1) % src_ptrs))
> > +				(*src_sges)[i].length += (cur_buf_size %
> src_ptrs);
> > +
> > +			(*dst_sges)[i].addr = rte_pktmbuf_iova((*dsts)[i]);
> > +			(*dst_sges)[i].length = sglen_dst;
> > +			if (!((i+1) % dst_ptrs))
> > +				(*dst_sges)[i].length += (cur_buf_size %
> dst_ptrs);
> > +		}
> > +	}
> > +
> >  	return 0;
> >  }
> >
> >  int
> > -mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
> > +mem_copy_benchmark(struct test_configure *cfg)
> >  {
> > -	uint32_t i;
> > +	uint32_t i, j;
> >  	uint32_t offset;
> >  	unsigned int lcore_id = 0;
> >  	struct rte_mbuf **srcs = NULL, **dsts = NULL, **m = NULL;
> > +	struct rte_dma_sge *src_sges = NULL, *dst_sges = NULL;
> >  	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
> > +	const uint32_t mcore_id = rte_get_main_lcore();
> >  	unsigned int buf_size = cfg->buf_size.cur;
> >  	uint16_t kick_batch = cfg->kick_batch.cur;
> > -	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) /
> (cfg->buf_size.cur * 2);
> >  	uint16_t nb_workers = ldm->cnt;
> >  	uint16_t test_secs = cfg->test_secs;
> >  	float memory = 0;
> > @@ -467,12 +585,32 @@ mem_copy_benchmark(struct test_configure
> *cfg, bool is_dma)
> >  	uint32_t avg_cycles_total;
> >  	float mops, mops_total;
> >  	float bandwidth, bandwidth_total;
> > +	uint32_t nr_sgsrc = 0, nr_sgdst = 0;
> > +	uint32_t nr_buf;
> >  	int ret = 0;
> >
> > -	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> > +	/* Align number of buffers according to workers count */
> > +	nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
> > +	nr_buf -= (nr_buf % nb_workers);
> > +	if (cfg->is_sg) {
> > +		nr_buf /= nb_workers;
> > +		nr_buf -= nr_buf % (cfg->src_ptrs * cfg->dst_ptrs);
> > +		nr_buf *= nb_workers;
> > +
> > +		if (cfg->dst_ptrs > cfg->src_ptrs) {
> > +			nr_sgsrc = (nr_buf / cfg->dst_ptrs * cfg->src_ptrs);
> > +			nr_sgdst = nr_buf;
> > +		} else {
> > +			nr_sgsrc = nr_buf;
> > +			nr_sgdst = (nr_buf / cfg->src_ptrs * cfg->dst_ptrs);
> > +		}
> > +	}
> 
> pls move to a subfunction
Ack.

> 
> > +
> > +	cfg->nr_buf = nr_buf;
> > +	if (setup_memory_env(cfg, &srcs, &dsts, &src_sges, &dst_sges) < 0)
> >  		goto out;
> >
> > -	if (is_dma)
> > +	if (cfg->is_dma)
> >  		if (config_dmadevs(cfg) < 0)
> >  			goto out;
> >
> > @@ -486,13 +624,23 @@ mem_copy_benchmark(struct test_configure
> *cfg, bool is_dma)
> >
> >  	for (i = 0; i < nb_workers; i++) {
> >  		lcore_id = ldm->lcores[i];
> > +		if (lcore_id == mcore_id) {
> > +			printf("lcore parameters can not use main core id
> %d\n", mcore_id);
> > +			goto out;
> > +		}
> > +
> > +		if (rte_eal_lcore_role(lcore_id) == ROLE_OFF) {
> > +			printf("lcore parameters can not use offline core id
> %d\n", lcore_id);
> > +			goto out;
> > +		}
> 
> The above two judgement should in a seperate commit.

Sorry, somehow it got mixed from different patch I had in my local repo.
It will be in different commit.

> 
> > +
> >  		offset = nr_buf / nb_workers * i;
> >  		lcores[i] = rte_malloc(NULL, sizeof(struct lcore_params), 0);
> >  		if (lcores[i] == NULL) {
> >  			printf("lcore parameters malloc failure for lcore %d\n",
> lcore_id);
> >  			break;
> >  		}
> > -		if (is_dma) {
> > +		if (cfg->is_dma) {
> >  			lcores[i]->dma_name = ldm->dma_names[i];
> >  			lcores[i]->dev_id = ldm->dma_ids[i];
> >  			lcores[i]->kick_batch = kick_batch;
> > @@ -506,10 +654,23 @@ mem_copy_benchmark(struct test_configure
> *cfg, bool is_dma)
> >  		lcores[i]->scenario_id = cfg->scenario_id;
> >  		lcores[i]->lcore_id = lcore_id;
> >
> > -		if (is_dma)
> > -			rte_eal_remote_launch(do_dma_mem_copy, (void
> *)(lcores[i]), lcore_id);
> > -		else
> > +		if (cfg->is_sg) {
> > +			lcores[i]->src_ptrs = cfg->src_ptrs;
> > +			lcores[i]->dst_ptrs = cfg->dst_ptrs;
> > +			lcores[i]->src_sges = src_sges + (nr_sgsrc / nb_workers
> * i);
> > +			lcores[i]->dst_sges = dst_sges + (nr_sgdst /
> nb_workers * i);
> > +		}
> > +
> > +		if (cfg->is_dma) {
> > +			if (!cfg->is_sg)
> > +
> 	rte_eal_remote_launch(do_dma_plain_mem_copy, (void *)(lcores[i]),
> > +					lcore_id);
> > +			else
> > +
> 	rte_eal_remote_launch(do_dma_sg_mem_copy, (void *)(lcores[i]),
> > +					lcore_id);
> > +		} else {
> >  			rte_eal_remote_launch(do_cpu_mem_copy, (void
> *)(lcores[i]), lcore_id);
> > +		}
> 
> too many judgement for selecting target function, how about wrap it
> subfunction:
> lcore_function_t get_work_function(struct test_configure *cfg)
> then rte_eal_remote_launch(get_work_function(cfg), (void *)(lcores[i]),
> lcore_id);
> 
Ack.

> >  	}
> >
> >  	while (1) {
> > @@ -541,13 +702,53 @@ mem_copy_benchmark(struct test_configure
> *cfg, bool is_dma)
> >
> >  	rte_eal_mp_wait_lcore();
> >
> > -	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
> > -		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
> > -			   rte_pktmbuf_mtod(dsts[i], void *),
> > -			   cfg->buf_size.cur) != 0) {
> > -			printf("Copy validation fails for buffer number %d\n",
> i);
> > -			ret = -1;
> > -			goto out;
> > +	if (!cfg->is_sg && cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM)
> {
> > +		for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
> > +			if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
> > +					rte_pktmbuf_mtod(dsts[i], void *),
> > +					cfg->buf_size.cur) != 0) {
> > +				printf("Copy validation fails for buffer number
> %d\n", i);
> > +				ret = -1;
> > +				goto out;
> > +			}
> > +		}
> > +	} else if (cfg->is_sg && cfg->transfer_dir ==
> RTE_DMA_DIR_MEM_TO_MEM) {
> > +		size_t src_remsz = buf_size % cfg->src_ptrs;
> > +		size_t dst_remsz = buf_size % cfg->dst_ptrs;
> > +		size_t src_sz = buf_size / cfg->src_ptrs;
> > +		size_t dst_sz = buf_size / cfg->dst_ptrs;
> > +		uint8_t src[buf_size], dst[buf_size];
> > +		uint8_t *sbuf, *dbuf, *ptr;
> > +
> > +		for (i = 0; i < (nr_buf / RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs));
> i++) {
> > +			sbuf = src;
> > +			dbuf = dst;
> > +			ptr = NULL;
> > +
> > +			for (j = 0; j < cfg->src_ptrs; j++) {
> > +				ptr = rte_pktmbuf_mtod(srcs[i * cfg->src_ptrs
> + j], uint8_t *);
> > +				memcpy(sbuf, ptr, src_sz);
> > +				sbuf += src_sz;
> > +			}
> > +
> > +			if (src_remsz)
> > +				memcpy(sbuf, ptr + src_sz, src_remsz);
> > +
> > +			for (j = 0; j < cfg->dst_ptrs; j++) {
> > +				ptr = rte_pktmbuf_mtod(dsts[i * cfg->dst_ptrs
> + j], uint8_t *);
> > +				memcpy(dbuf, ptr, dst_sz);
> > +				dbuf += dst_sz;
> > +			}
> > +
> > +			if (dst_remsz)
> > +				memcpy(dbuf, ptr + dst_sz, dst_remsz);
> > +
> > +			if (memcmp(src, dst, buf_size) != 0) {
> > +				printf("SG Copy validation fails for buffer
> number %d\n",
> > +					i * cfg->src_ptrs);
> > +				ret = -1;
> > +				goto out;
> > +			}
> 
> Now I doubt the value of verify, this verify can't find the middle round copy
> failure,
> because as long as the last round copy is successful, the validation will pass.
> 
Validation is on entire buffer. If any middle copy is a failure, entire memcmp
would have failed. Or do I miss something ?

> And adding validatation in every round copy will impact performance.
> 
This validation is just after worker function is stopped measuring perf.
How would this impact performance ?

> Also app/test_dmadev already verify data. so I think we should drop the
> validation commit.

Even in some corner cases or unknown issues, copy would have failed
and taking perf cycles then is meaningless. That is the reason, this validation
is added after perf function doing its job.

> 
> >  		}
> >  	}
> >
> > @@ -558,10 +759,8 @@ mem_copy_benchmark(struct test_configure *cfg,
> bool is_dma)
> >  		calc_result(buf_size, nr_buf, nb_workers, test_secs,
> >  			lcores[i]->worker_info.test_cpl,
> >  			&memory, &avg_cycles, &bandwidth, &mops);
> > -		output_result(cfg->scenario_id, lcores[i]->lcore_id,
> > -					lcores[i]->dma_name, cfg-
> >ring_size.cur, kick_batch,
> > -					avg_cycles, buf_size, nr_buf /
> nb_workers, memory,
> > -					bandwidth, mops, is_dma);
> > +		output_result(cfg, lcores[i], kick_batch, avg_cycles, buf_size,
> > +			nr_buf / nb_workers, memory, bandwidth, mops);
> >  		mops_total += mops;
> >  		bandwidth_total += bandwidth;
> >  		avg_cycles_total += avg_cycles;
> > @@ -604,13 +803,20 @@ mem_copy_benchmark(struct test_configure
> *cfg, bool is_dma)
> >  	rte_mempool_free(dst_pool);
> >  	dst_pool = NULL;
> >
> > +	/* free sges for mbufs */
> > +	rte_free(src_sges);
> > +	src_sges = NULL;
> > +
> > +	rte_free(dst_sges);
> > +	dst_sges = NULL;
> > +
> >  	/* free the worker parameters */
> >  	for (i = 0; i < nb_workers; i++) {
> >  		rte_free(lcores[i]);
> >  		lcores[i] = NULL;
> >  	}
> >
> > -	if (is_dma) {
> > +	if (cfg->is_dma) {
> >  		for (i = 0; i < nb_workers; i++) {
> >  			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
> >  			rte_dma_stop(ldm->dma_ids[i]);
> > diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
> > index 9c8221025e..28f6c9d1db 100644
> > --- a/app/test-dma-perf/config.ini
> > +++ b/app/test-dma-perf/config.ini
> > @@ -38,6 +38,14 @@
> >
> >  ; "skip" To skip a test-case set skip to 1.
> 
> Please place hese patchset new add entrys' descriptions above the
> "; To specify a configuration file, use the "--config" flag followed by the path to
> the file."
> 
> because original config.ini, fist is parameters descriptor, and then program
> argment descriptor, and last was example.
> 
Ack.
> >
> > +; Parameters to be configured for SG copy:
> 
> Parameters for DMA scatter-gather memory copy:
> 
Ack.
> > +; ========================================
> 
> Please remove this line
> 
Ack.
> > +; "dma_src_sge" denotes number of source segments.
> > +; "dma_dst_sge" denotes number of destination segments.
> > +;
> > +; For SG copy, both the parameters need to be configured and they are valid
> only
> > +; when type is DMA_MEM_COPY.
> 
> For DMA scatter-gather memory copy, the parameters need to be configured
> and they are valid only
> when type is DMA_MEM_COPY.
> 
Ack.
> > +;
> >  ; Parameters to be configured for data transfers from "mem to dev" and
> "dev to mem":
> >  ;
> ===================================================================
> ===============
> 
> Please remove this line
> 
> As another commit "Re: [PATCH v2] app/dma-perf: support bi-directional
> transfer"'s review feedback,
> these descriptor should place after
> "
> ; To use DMA for a test, please specify the "lcore_dma" parameter.
> ; If you have already set the "-l" and "-a" parameters using EAL,
> ; make sure that the value of "lcore_dma" falls within their range of the values.
> ; We have to ensure a 1:1 mapping between the core and DMA device.
> "
> 
> 
> >  ; "direction" denotes the direction of data transfer. It can take 3 values:
> > @@ -69,6 +77,21 @@ lcore_dma=lcore10@0000:00:04.2,
> lcore11@0000:00:04.3
> >  eal_args=--in-memory --file-prefix=test
> >
> >  [case2]
> > +type=DMA_MEM_COPY
> > +mem_size=10
> > +buf_size=64,8192,2,MUL
> > +dma_ring_size=1024
> > +dma_src_sge=4
> > +dma_dst_sge=1
> > +kick_batch=32
> > +src_numa_node=0
> > +dst_numa_node=0
> > +cache_flush=0
> > +test_seconds=2
> > +lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
> > +eal_args=--in-memory --file-prefix=test
> > +
> > +[case3]
> >  skip=1
> >  type=DMA_MEM_COPY
> >  direction=dev2mem
> > @@ -84,7 +107,7 @@ test_seconds=2
> >  lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
> >  eal_args=--in-memory --file-prefix=test
> >
> > -[case3]
> > +[case4]
> >  type=CPU_MEM_COPY
> >  mem_size=10
> >  buf_size=64,8192,2,MUL
> > diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
> > index df05bcd7df..a27e4c9429 100644
> > --- a/app/test-dma-perf/main.c
> > +++ b/app/test-dma-perf/main.c
> > @@ -108,10 +108,8 @@ run_test_case(struct test_configure *case_cfg)
> >
> >  	switch (case_cfg->test_type) {
> >  	case TEST_TYPE_DMA_MEM_COPY:
> > -		ret = mem_copy_benchmark(case_cfg, true);
> > -		break;
> >  	case TEST_TYPE_CPU_MEM_COPY:
> > -		ret = mem_copy_benchmark(case_cfg, false);
> > +		ret = mem_copy_benchmark(case_cfg);
> >  		break;
> >  	default:
> >  		printf("Unknown test type. %s\n", case_cfg->test_type_str);
> > @@ -365,7 +363,8 @@ load_configs(const char *path)
> >  	const char *case_type;
> >  	const char *transfer_dir;
> >  	const char *lcore_dma;
> > -	const char *mem_size_str, *buf_size_str, *ring_size_str,
> *kick_batch_str;
> > +	const char *mem_size_str, *buf_size_str, *ring_size_str,
> *kick_batch_str,
> > +		*src_ptrs_str, *dst_ptrs_str;
> >  	const char *skip;
> >  	struct rte_kvargs *kvlist;
> >  	const char *vchan_dev;
> > @@ -467,6 +466,7 @@ load_configs(const char *path)
> >  			rte_kvargs_free(kvlist);
> >  		}
> >
> > +		test_case->is_dma = is_dma;
> >  		test_case->src_numa_node =
> (int)atoi(rte_cfgfile_get_entry(cfgfile,
> >
> 	section_name, "src_numa_node"));
> >  		test_case->dst_numa_node =
> (int)atoi(rte_cfgfile_get_entry(cfgfile,
> > @@ -501,6 +501,32 @@ load_configs(const char *path)
> >  			} else if (args_nr == 4)
> >  				nb_vp++;
> >
> > +			src_ptrs_str = rte_cfgfile_get_entry(cfgfile,
> section_name,
> > +
> 	"dma_src_sge");
> > +			if (src_ptrs_str != NULL) {
> > +				test_case->src_ptrs =
> (int)atoi(rte_cfgfile_get_entry(cfgfile,
> > +
> 	section_name, "dma_src_sge"));
> > +			}
> > +
> > +			dst_ptrs_str = rte_cfgfile_get_entry(cfgfile,
> section_name,
> > +
> 	"dma_dst_sge");
> > +			if (dst_ptrs_str != NULL) {
> > +				test_case->dst_ptrs =
> (int)atoi(rte_cfgfile_get_entry(cfgfile,
> > +
> 	section_name, "dma_dst_sge"));
> > +			}
> > +
> > +			if ((src_ptrs_str != NULL && dst_ptrs_str == NULL) ||
> > +			    (src_ptrs_str == NULL && dst_ptrs_str != NULL)) {
> 
> Please also check test_case->src_ptrs and test_case->dst_ptrs valid, make sure
> there are >1 and <=UINT16_MAX

At present, this is uint8_t. Do we need it more than UINT8_MAX ?

> 
> > +				printf("parse dma_src_sge, dma_dst_sge error
> in case %d.\n",
> > +					i + 1);
> > +				test_case->is_valid = false;
> > +				continue;
> > +			} else if (src_ptrs_str != NULL && dst_ptrs_str != NULL)
> {
> > +				test_case->is_sg = true;
> > +			} else {
> > +				test_case->is_sg = false;
> 
> the above could simple by: test_case->is_sg = (src_ptrs_str != NULL &&
> dst_ptrs_str != NULL);
> 
Added check for nb_ validation here. Please check in next version of patch.
> > +			}
> > +
> >  			kick_batch_str = rte_cfgfile_get_entry(cfgfile,
> section_name, "kick_batch");
> >  			args_nr = parse_entry(kick_batch_str, &test_case-
> >kick_batch);
> >  			if (args_nr < 0) {
> > diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
> > index 1123e7524a..baf149b72b 100644
> > --- a/app/test-dma-perf/main.h
> > +++ b/app/test-dma-perf/main.h
> > @@ -53,11 +53,14 @@ struct test_configure {
> >  	uint16_t dst_numa_node;
> >  	uint16_t opcode;
> >  	bool is_dma;
> > +	bool is_sg;
> >  	struct lcore_dma_map_t lcore_dma_map;
> >  	struct test_configure_entry mem_size;
> >  	struct test_configure_entry buf_size;
> >  	struct test_configure_entry ring_size;
> >  	struct test_configure_entry kick_batch;
> > +	uint8_t src_ptrs;
> > +	uint8_t dst_ptrs;
> >  	uint8_t cache_flush;
> >  	uint32_t nr_buf;
> >  	uint16_t test_secs;
> > @@ -66,6 +69,6 @@ struct test_configure {
> >  	struct test_vchan_dev_config vchan_dev;
> >  };
> >
> > -int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
> > +int mem_copy_benchmark(struct test_configure *cfg);
> >
> >  #endif /* MAIN_H */
> >

Thank you for your review. Please confirm if there are any other changes
and I hope next version goes through 😊

Regards,
Gowrishankar
  
Chengwen Feng March 1, 2024, 2:07 a.m. UTC | #3
Hi Gowrishankar,

On 2024/2/29 21:16, Gowrishankar Muthukrishnan wrote:
> Hi Fengcheng,
> 
>> -----Original Message-----
>> From: fengchengwen <fengchengwen@huawei.com>
>> Sent: Wednesday, February 28, 2024 3:02 PM
>> To: Amit Prakash Shukla <amitprakashs@marvell.com>; Cheng Jiang
>> <honest.jiang@foxmail.com>
>> Cc: dev@dpdk.org; Jerin Jacob <jerinj@marvell.com>; Anoob Joseph
>> <anoobj@marvell.com>; Kevin Laatz <kevin.laatz@intel.com>; Bruce
>> Richardson <bruce.richardson@intel.com>; Pavan Nikhilesh Bhagavatula
>> <pbhagavatula@marvell.com>; Gowrishankar Muthukrishnan
>> <gmuthukrishn@marvell.com>
>> Subject: [EXT] Re: [PATCH v10 4/4] app/dma-perf: add SG copy support
>>
>> External Email
>>
>> ----------------------------------------------------------------------
>> Hi Gowrishankar,
>>
>> On 2024/2/28 2:56, Amit Prakash Shukla wrote:
>>> From: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
>>>
>>> Add SG copy support.
>>>
>>> Signed-off-by: Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
>>> Acked-by: Anoob Joseph <anoobj@marvell.com>
>>> Acked-by: Chengwen Feng <fengchengwen@huawei.com>
>>> ---
>>> v10:
>>> - SG config variables renamed.
>>>
>>>  app/test-dma-perf/benchmark.c | 278
>>> +++++++++++++++++++++++++++++-----
>>>  app/test-dma-perf/config.ini  |  25 ++-
>>>  app/test-dma-perf/main.c      |  34 ++++-
>>>  app/test-dma-perf/main.h      |   5 +-
>>>  4 files changed, 300 insertions(+), 42 deletions(-)
>>>
>>> diff --git a/app/test-dma-perf/benchmark.c
>>> b/app/test-dma-perf/benchmark.c index 0047e2f4b8..25ed6fa6d0 100644
>>> --- a/app/test-dma-perf/benchmark.c
>>> +++ b/app/test-dma-perf/benchmark.c
>>> @@ -46,6 +46,10 @@ struct lcore_params {
>>>  	uint16_t test_secs;
>>>  	struct rte_mbuf **srcs;
>>>  	struct rte_mbuf **dsts;
>>> +	struct rte_dma_sge *src_sges;
>>> +	struct rte_dma_sge *dst_sges;
>>> +	uint8_t src_ptrs;
>>> +	uint8_t dst_ptrs;
>>
>> 1. src/dst_ptrs -> src/dst_nb_sge
> Ack.
> 
>> 2. How about wrap these four fields as a struct?
> Ack.
> 
>>
>>>  	volatile struct worker_info worker_info;  };
>>>
>>> @@ -86,21 +90,31 @@ calc_result(uint32_t buf_size, uint32_t nr_buf,
>>> uint16_t nb_workers, uint16_t te  }
>>>
>>>  static void
>>> -output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name,
>> uint16_t ring_size,
>>> -			uint16_t kick_batch, uint64_t ave_cycle, uint32_t
>> buf_size, uint32_t nr_buf,
>>> -			float memory, float bandwidth, float mops, bool
>> is_dma)
>>> +output_result(struct test_configure *cfg, struct lcore_params *para,
>>> +			uint16_t kick_batch, uint64_t ave_cycle, uint32_t
>> buf_size,
>>> +			uint32_t nr_buf, float memory, float bandwidth, float
>> mops)
>>>  {
>>> -	if (is_dma)
>>> -		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size:
>> %u.\n",
>>> -				lcore_id, dma_name, ring_size, kick_batch);
>>> -	else
>>> +	uint16_t ring_size = cfg->ring_size.cur;
>>> +	uint8_t scenario_id = cfg->scenario_id;
>>> +	uint32_t lcore_id = para->lcore_id;
>>> +	char *dma_name = para->dma_name;
>>> +
>>> +	if (cfg->is_dma) {
>>> +		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size:
>> %u", lcore_id,
>>> +		       dma_name, ring_size, kick_batch);
>>> +		if (cfg->is_sg)
>>> +			printf(" DMA src ptrs: %u, dst ptrs: %u",
>>> +			       para->src_ptrs, para->dst_ptrs);
>>
>> DMA src sges: %u DMA dst sges: %u
>>
>> I think we should add a column which title maybe misc, some like sg-src[4]-
>> dst[1], and later we may add fill test, then this field could be pattern-
>> 0x12345678
>>
>> And in "[PATCH v10 2/4] app/dma-perf: add PCI device support" commit, if
>> the DMA was worked in non-mem2mem direction, we could add simple
>> descriptor of direction and pcie.info in the above misc column.
>>
> 
> I am sorry, I could not understand complete picture here. Do you mean we 
> reserve a column and use it as per test type.
> 
> For plain mem copy, nothing added.
> For SG mem copy, instead of showing "DMA src sges: 1, dst sges: 4", print "sg-src[1]-dst[4]".
> In future, when we add fill test in benchmark, this line instead be "pattern-0x12345678".
> 
> Is my understanding correct over here ?

Yes, some like this.

> 
>>> +		printf(".\n");
>>> +	} else {
>>>  		printf("lcore %u\n", lcore_id);
>>> +	}
>>>
>>>  	printf("Average Cycles/op: %" PRIu64 ", Buffer Size: %u B, Buffer
>> Number: %u, Memory: %.2lf MB, Frequency: %.3lf Ghz.\n",
>>>  			ave_cycle, buf_size, nr_buf, memory,
>> rte_get_timer_hz()/1000000000.0);
>>>  	printf("Average Bandwidth: %.3lf Gbps, MOps: %.3lf\n", bandwidth,
>>> mops);
>>>
>>> -	if (is_dma)
>>> +	if (cfg->is_dma)
>>>  		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
>> CSV_LINE_DMA_FMT,
>>>  			scenario_id, lcore_id, dma_name, ring_size,
>> kick_batch, buf_size,
>>>  			nr_buf, memory, ave_cycle, bandwidth, mops); @@ -
>> 167,7 +181,7 @@
>>> vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
>>>
>>>  /* Configuration of device. */
>>>  static void
>>> -configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
>>> +configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg,
>>> +uint8_t ptrs_max)
>>>  {
>>>  	uint16_t vchan = 0;
>>>  	struct rte_dma_info info;
>>> @@ -190,6 +204,10 @@ configure_dmadev_queue(uint32_t dev_id, struct
>> test_configure *cfg)
>>>  		rte_exit(EXIT_FAILURE, "Error, no configured queues reported
>> on device id. %u\n",
>>>  				dev_id);
>>>
>>> +	if (info.max_sges < ptrs_max)
>>> +		rte_exit(EXIT_FAILURE, "Error, DMA ptrs more than supported
>> by
>>> +device id %u.\n",
>>
>> "Error with unsupport max_sges on device id %u.\n"
> Ack.
> 
>>
>>> +				dev_id);
>>> +
>>>  	if (rte_dma_start(dev_id) != 0)
>>>  		rte_exit(EXIT_FAILURE, "Error with dma start.\n");  } @@ -
>> 202,8
>>> +220,12 @@ config_dmadevs(struct test_configure *cfg)
>>>  	uint32_t i;
>>>  	int dev_id;
>>>  	uint16_t nb_dmadevs = 0;
>>> +	uint8_t ptrs_max = 0;
>>
>> It hard to understand, how about nb_sge?
> 
> Ack. Renamed it to nb_sges.
>>
>>>  	char *dma_name;
>>>
>>> +	if (cfg->is_sg)
>>> +		ptrs_max = RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs);
>>> +
>>>  	for (i = 0; i < ldm->cnt; i++) {
>>>  		dma_name = ldm->dma_names[i];
>>>  		dev_id = rte_dma_get_dev_id_by_name(dma_name);
>>> @@ -213,7 +235,7 @@ config_dmadevs(struct test_configure *cfg)
>>>  		}
>>>
>>>  		ldm->dma_ids[i] = dev_id;
>>> -		configure_dmadev_queue(dev_id, cfg);
>>> +		configure_dmadev_queue(dev_id, cfg, ptrs_max);
>>>  		++nb_dmadevs;
>>>  	}
>>>
>>> @@ -253,7 +275,7 @@ do_dma_submit_and_poll(uint16_t dev_id,
>> uint64_t *async_cnt,
>>>  }
>>>
>>>  static inline int
>>> -do_dma_mem_copy(void *p)
>>> +do_dma_plain_mem_copy(void *p)
>>>  {
>>>  	struct lcore_params *para = (struct lcore_params *)p;
>>>  	volatile struct worker_info *worker_info = &(para->worker_info);
>>> @@ -306,6 +328,65 @@ do_dma_mem_copy(void *p)
>>>  	return 0;
>>>  }
>>>
>>> +static inline int
>>> +do_dma_sg_mem_copy(void *p)
>>> +{
>>> +	struct lcore_params *para = (struct lcore_params *)p;
>>> +	volatile struct worker_info *worker_info = &(para->worker_info);
>>> +	struct rte_dma_sge *src_sges = para->src_sges;
>>> +	struct rte_dma_sge *dst_sges = para->dst_sges;
>>> +	const uint16_t kick_batch = para->kick_batch;
>>> +	const uint8_t src_ptrs = para->src_ptrs;
>>> +	const uint8_t dst_ptrs = para->dst_ptrs;
>>> +	const uint16_t dev_id = para->dev_id;
>>> +	uint32_t nr_buf = para->nr_buf;
>>> +	uint64_t async_cnt = 0;
>>> +	uint32_t poll_cnt = 0;
>>> +	uint16_t nr_cpl;
>>> +	uint32_t i, j;
>>> +	int ret;
>>> +
>>> +	nr_buf /= RTE_MAX(src_ptrs, dst_ptrs);
>>> +	worker_info->stop_flag = false;
>>> +	worker_info->ready_flag = true;
>>> +
>>> +	while (!worker_info->start_flag)
>>> +		;
>>> +
>>> +	while (1) {
>>> +		j = 0;
>>> +		for (i = 0; i < nr_buf; i++) {
>>> +dma_copy:
>>> +			ret = rte_dma_copy_sg(dev_id, 0,
>>> +				&src_sges[i * src_ptrs], &dst_sges[j *
>> dst_ptrs],
>>> +				src_ptrs, dst_ptrs, 0);
>>> +			if (unlikely(ret < 0)) {
>>> +				if (ret == -ENOSPC) {
>>> +					do_dma_submit_and_poll(dev_id,
>> &async_cnt, worker_info);
>>> +					goto dma_copy;
>>> +				} else
>>> +					error_exit(dev_id);
>>> +			}
>>> +			async_cnt++;
>>> +			j++;
>>> +
>>> +			if ((async_cnt % kick_batch) == 0)
>>> +				do_dma_submit_and_poll(dev_id,
>> &async_cnt, worker_info);
>>> +		}
>>> +
>>> +		if (worker_info->stop_flag)
>>> +			break;
>>> +	}
>>> +
>>> +	rte_dma_submit(dev_id, 0);
>>> +	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
>>> +		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB,
>> NULL, NULL);
>>> +		async_cnt -= nr_cpl;
>>> +	}
>>> +
>>> +	return 0;
>>> +}
>>> +
>>>  static inline int
>>>  do_cpu_mem_copy(void *p)
>>>  {
>>> @@ -347,8 +428,9 @@ dummy_free_ext_buf(void *addr, void *opaque)
>>>  }
>>>
>>>  static int
>>> -setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
>>> -			struct rte_mbuf ***dsts)
>>> +setup_memory_env(struct test_configure *cfg,
>>> +			 struct rte_mbuf ***srcs, struct rte_mbuf ***dsts,
>>> +			 struct rte_dma_sge **src_sges, struct rte_dma_sge
>> **dst_sges)
>>>  {
>>>  	static struct rte_mbuf_ext_shared_info *ext_buf_info;
>>>  	unsigned int cur_buf_size = cfg->buf_size.cur;
>>> @@ -409,8 +491,8 @@ setup_memory_env(struct test_configure *cfg,
>> struct rte_mbuf ***srcs,
>>>  	}
>>>
>>>  	for (i = 0; i < nr_buf; i++) {
>>> -		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(),
>> buf_size);
>>> -		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, buf_size);
>>> +		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(),
>> cur_buf_size);
>>> +		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0,
>> cur_buf_size);
>>>  	}
>>>
>>>  	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
>>> @@ -446,20 +528,56 @@ setup_memory_env(struct test_configure *cfg,
>> struct rte_mbuf ***srcs,
>>>  		}
>>>  	}
>>>
>>> +	if (cfg->is_sg) {
>>> +		uint8_t src_ptrs = cfg->src_ptrs;
>>> +		uint8_t dst_ptrs = cfg->dst_ptrs;
>>> +		uint32_t sglen_src, sglen_dst;
>>> +
>>> +		*src_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct
>> rte_dma_sge),
>>> +					RTE_CACHE_LINE_SIZE);
>>> +		if (*src_sges == NULL) {
>>> +			printf("Error: src_sges array malloc failed.\n");
>>> +			return -1;
>>> +		}
>>> +
>>> +		*dst_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct
>> rte_dma_sge),
>>> +					RTE_CACHE_LINE_SIZE);
>>> +		if (*dst_sges == NULL) {
>>> +			printf("Error: dst_sges array malloc failed.\n");
>>> +			return -1;
>>> +		}
>>> +
>>> +		sglen_src = cur_buf_size / src_ptrs;
>>> +		sglen_dst = cur_buf_size / dst_ptrs;
>>> +
>>> +		for (i = 0; i < nr_buf; i++) {
>>> +			(*src_sges)[i].addr = rte_pktmbuf_iova((*srcs)[i]);
>>> +			(*src_sges)[i].length = sglen_src;
>>> +			if (!((i+1) % src_ptrs))
>>> +				(*src_sges)[i].length += (cur_buf_size %
>> src_ptrs);
>>> +
>>> +			(*dst_sges)[i].addr = rte_pktmbuf_iova((*dsts)[i]);
>>> +			(*dst_sges)[i].length = sglen_dst;
>>> +			if (!((i+1) % dst_ptrs))
>>> +				(*dst_sges)[i].length += (cur_buf_size %
>> dst_ptrs);
>>> +		}
>>> +	}
>>> +
>>>  	return 0;
>>>  }
>>>
>>>  int
>>> -mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
>>> +mem_copy_benchmark(struct test_configure *cfg)
>>>  {
>>> -	uint32_t i;
>>> +	uint32_t i, j;
>>>  	uint32_t offset;
>>>  	unsigned int lcore_id = 0;
>>>  	struct rte_mbuf **srcs = NULL, **dsts = NULL, **m = NULL;
>>> +	struct rte_dma_sge *src_sges = NULL, *dst_sges = NULL;
>>>  	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
>>> +	const uint32_t mcore_id = rte_get_main_lcore();
>>>  	unsigned int buf_size = cfg->buf_size.cur;
>>>  	uint16_t kick_batch = cfg->kick_batch.cur;
>>> -	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) /
>> (cfg->buf_size.cur * 2);
>>>  	uint16_t nb_workers = ldm->cnt;
>>>  	uint16_t test_secs = cfg->test_secs;
>>>  	float memory = 0;
>>> @@ -467,12 +585,32 @@ mem_copy_benchmark(struct test_configure
>> *cfg, bool is_dma)
>>>  	uint32_t avg_cycles_total;
>>>  	float mops, mops_total;
>>>  	float bandwidth, bandwidth_total;
>>> +	uint32_t nr_sgsrc = 0, nr_sgdst = 0;
>>> +	uint32_t nr_buf;
>>>  	int ret = 0;
>>>
>>> -	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
>>> +	/* Align number of buffers according to workers count */
>>> +	nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
>>> +	nr_buf -= (nr_buf % nb_workers);
>>> +	if (cfg->is_sg) {
>>> +		nr_buf /= nb_workers;
>>> +		nr_buf -= nr_buf % (cfg->src_ptrs * cfg->dst_ptrs);
>>> +		nr_buf *= nb_workers;
>>> +
>>> +		if (cfg->dst_ptrs > cfg->src_ptrs) {
>>> +			nr_sgsrc = (nr_buf / cfg->dst_ptrs * cfg->src_ptrs);
>>> +			nr_sgdst = nr_buf;
>>> +		} else {
>>> +			nr_sgsrc = nr_buf;
>>> +			nr_sgdst = (nr_buf / cfg->src_ptrs * cfg->dst_ptrs);
>>> +		}
>>> +	}
>>
>> pls move to a subfunction
> Ack.
> 
>>
>>> +
>>> +	cfg->nr_buf = nr_buf;
>>> +	if (setup_memory_env(cfg, &srcs, &dsts, &src_sges, &dst_sges) < 0)
>>>  		goto out;
>>>
>>> -	if (is_dma)
>>> +	if (cfg->is_dma)
>>>  		if (config_dmadevs(cfg) < 0)
>>>  			goto out;
>>>
>>> @@ -486,13 +624,23 @@ mem_copy_benchmark(struct test_configure
>> *cfg, bool is_dma)
>>>
>>>  	for (i = 0; i < nb_workers; i++) {
>>>  		lcore_id = ldm->lcores[i];
>>> +		if (lcore_id == mcore_id) {
>>> +			printf("lcore parameters can not use main core id
>> %d\n", mcore_id);
>>> +			goto out;
>>> +		}
>>> +
>>> +		if (rte_eal_lcore_role(lcore_id) == ROLE_OFF) {
>>> +			printf("lcore parameters can not use offline core id
>> %d\n", lcore_id);
>>> +			goto out;
>>> +		}
>>
>> The above two judgement should in a seperate commit.
> 
> Sorry, somehow it got mixed from different patch I had in my local repo.
> It will be in different commit.
> 
>>
>>> +
>>>  		offset = nr_buf / nb_workers * i;
>>>  		lcores[i] = rte_malloc(NULL, sizeof(struct lcore_params), 0);
>>>  		if (lcores[i] == NULL) {
>>>  			printf("lcore parameters malloc failure for lcore %d\n",
>> lcore_id);
>>>  			break;
>>>  		}
>>> -		if (is_dma) {
>>> +		if (cfg->is_dma) {
>>>  			lcores[i]->dma_name = ldm->dma_names[i];
>>>  			lcores[i]->dev_id = ldm->dma_ids[i];
>>>  			lcores[i]->kick_batch = kick_batch;
>>> @@ -506,10 +654,23 @@ mem_copy_benchmark(struct test_configure
>> *cfg, bool is_dma)
>>>  		lcores[i]->scenario_id = cfg->scenario_id;
>>>  		lcores[i]->lcore_id = lcore_id;
>>>
>>> -		if (is_dma)
>>> -			rte_eal_remote_launch(do_dma_mem_copy, (void
>> *)(lcores[i]), lcore_id);
>>> -		else
>>> +		if (cfg->is_sg) {
>>> +			lcores[i]->src_ptrs = cfg->src_ptrs;
>>> +			lcores[i]->dst_ptrs = cfg->dst_ptrs;
>>> +			lcores[i]->src_sges = src_sges + (nr_sgsrc / nb_workers
>> * i);
>>> +			lcores[i]->dst_sges = dst_sges + (nr_sgdst /
>> nb_workers * i);
>>> +		}
>>> +
>>> +		if (cfg->is_dma) {
>>> +			if (!cfg->is_sg)
>>> +
>> 	rte_eal_remote_launch(do_dma_plain_mem_copy, (void *)(lcores[i]),
>>> +					lcore_id);
>>> +			else
>>> +
>> 	rte_eal_remote_launch(do_dma_sg_mem_copy, (void *)(lcores[i]),
>>> +					lcore_id);
>>> +		} else {
>>>  			rte_eal_remote_launch(do_cpu_mem_copy, (void
>> *)(lcores[i]), lcore_id);
>>> +		}
>>
>> too many judgement for selecting target function, how about wrap it
>> subfunction:
>> lcore_function_t get_work_function(struct test_configure *cfg)
>> then rte_eal_remote_launch(get_work_function(cfg), (void *)(lcores[i]),
>> lcore_id);
>>
> Ack.
> 
>>>  	}
>>>
>>>  	while (1) {
>>> @@ -541,13 +702,53 @@ mem_copy_benchmark(struct test_configure
>> *cfg, bool is_dma)
>>>
>>>  	rte_eal_mp_wait_lcore();
>>>
>>> -	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
>>> -		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
>>> -			   rte_pktmbuf_mtod(dsts[i], void *),
>>> -			   cfg->buf_size.cur) != 0) {
>>> -			printf("Copy validation fails for buffer number %d\n",
>> i);
>>> -			ret = -1;
>>> -			goto out;
>>> +	if (!cfg->is_sg && cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM)
>> {
>>> +		for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
>>> +			if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
>>> +					rte_pktmbuf_mtod(dsts[i], void *),
>>> +					cfg->buf_size.cur) != 0) {
>>> +				printf("Copy validation fails for buffer number
>> %d\n", i);
>>> +				ret = -1;
>>> +				goto out;
>>> +			}
>>> +		}
>>> +	} else if (cfg->is_sg && cfg->transfer_dir ==
>> RTE_DMA_DIR_MEM_TO_MEM) {
>>> +		size_t src_remsz = buf_size % cfg->src_ptrs;
>>> +		size_t dst_remsz = buf_size % cfg->dst_ptrs;
>>> +		size_t src_sz = buf_size / cfg->src_ptrs;
>>> +		size_t dst_sz = buf_size / cfg->dst_ptrs;
>>> +		uint8_t src[buf_size], dst[buf_size];
>>> +		uint8_t *sbuf, *dbuf, *ptr;
>>> +
>>> +		for (i = 0; i < (nr_buf / RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs));
>> i++) {
>>> +			sbuf = src;
>>> +			dbuf = dst;
>>> +			ptr = NULL;
>>> +
>>> +			for (j = 0; j < cfg->src_ptrs; j++) {
>>> +				ptr = rte_pktmbuf_mtod(srcs[i * cfg->src_ptrs
>> + j], uint8_t *);
>>> +				memcpy(sbuf, ptr, src_sz);
>>> +				sbuf += src_sz;
>>> +			}
>>> +
>>> +			if (src_remsz)
>>> +				memcpy(sbuf, ptr + src_sz, src_remsz);
>>> +
>>> +			for (j = 0; j < cfg->dst_ptrs; j++) {
>>> +				ptr = rte_pktmbuf_mtod(dsts[i * cfg->dst_ptrs
>> + j], uint8_t *);
>>> +				memcpy(dbuf, ptr, dst_sz);
>>> +				dbuf += dst_sz;
>>> +			}
>>> +
>>> +			if (dst_remsz)
>>> +				memcpy(dbuf, ptr + dst_sz, dst_remsz);
>>> +
>>> +			if (memcmp(src, dst, buf_size) != 0) {
>>> +				printf("SG Copy validation fails for buffer
>> number %d\n",
>>> +					i * cfg->src_ptrs);
>>> +				ret = -1;
>>> +				goto out;
>>> +			}
>>
>> Now I doubt the value of verify, this verify can't find the middle round copy
>> failure,
>> because as long as the last round copy is successful, the validation will pass.
>>
> Validation is on entire buffer. If any middle copy is a failure, entire memcmp
> would have failed. Or do I miss something ?
> 
>> And adding validatation in every round copy will impact performance.
>>
> This validation is just after worker function is stopped measuring perf.
> How would this impact performance ?

Yes, it will don't impact performance.

What I said before is that is not valid, pls consider following scene:


	while (1) {
		for (i = 0; i < nr_buf; i++) {  // this for loop will copy all nr_bufs, let's defind this is a round copy.
dma_copy:
			ret = rte_dma_copy(dev_id, 0, rte_mbuf_data_iova(srcs[i]),
				rte_mbuf_data_iova(dsts[i]), buf_size, 0);
			if (unlikely(ret < 0)) {
				if (ret == -ENOSPC) {
					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
					goto dma_copy;
				} else
					error_exit(dev_id);
			}
			async_cnt++;

			if ((async_cnt % kick_batch) == 0)
				do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
		}

		if (worker_info->stop_flag)   // if don't stop, it will do many round copies.
			break;
	}

and the later validation just verify the last round, let's assume there are 100 round, and if the last round copy
work well, but round 0~98 both copy fail, then the validation will not detect it.


So if we want do all the validation, then we should add the velidation after every round copy, but it will
impact the performance.


> 
>> Also app/test_dmadev already verify data. so I think we should drop the
>> validation commit.
> 
> Even in some corner cases or unknown issues, copy would have failed
> and taking perf cycles then is meaningless. That is the reason, this validation
> is added after perf function doing its job.

How about:

	while (1) {
		for (i = 0; i < nr_buf; i++) {  // this for loop will copy all nr_bufs, let's defind this is a round copy.
dma_copy:
			ret = rte_dma_copy(dev_id, 0, rte_mbuf_data_iova(srcs[i]),
				rte_mbuf_data_iova(dsts[i]), buf_size, 0);
			if (unlikely(ret < 0)) {
				if (ret == -ENOSPC) {
					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
					goto dma_copy;
				} else
					error_exit(dev_id);
			}
			async_cnt++;

			if ((async_cnt % kick_batch) == 0)
				do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
		}

		if (unlikely(work_info->verify)) {
			ret = verify();
			if (ret != 0) {
				// error trace,
				break;
			}
		}

		if (worker_info->stop_flag)   // if don't stop, it will do many round copies.
			break;
	}

and make this verify as a config entry

> 
>>
>>>  		}
>>>  	}
>>>
>>> @@ -558,10 +759,8 @@ mem_copy_benchmark(struct test_configure *cfg,
>> bool is_dma)
>>>  		calc_result(buf_size, nr_buf, nb_workers, test_secs,
>>>  			lcores[i]->worker_info.test_cpl,
>>>  			&memory, &avg_cycles, &bandwidth, &mops);
>>> -		output_result(cfg->scenario_id, lcores[i]->lcore_id,
>>> -					lcores[i]->dma_name, cfg-
>>> ring_size.cur, kick_batch,
>>> -					avg_cycles, buf_size, nr_buf /
>> nb_workers, memory,
>>> -					bandwidth, mops, is_dma);
>>> +		output_result(cfg, lcores[i], kick_batch, avg_cycles, buf_size,
>>> +			nr_buf / nb_workers, memory, bandwidth, mops);
>>>  		mops_total += mops;
>>>  		bandwidth_total += bandwidth;
>>>  		avg_cycles_total += avg_cycles;
>>> @@ -604,13 +803,20 @@ mem_copy_benchmark(struct test_configure
>> *cfg, bool is_dma)
>>>  	rte_mempool_free(dst_pool);
>>>  	dst_pool = NULL;
>>>
>>> +	/* free sges for mbufs */
>>> +	rte_free(src_sges);
>>> +	src_sges = NULL;
>>> +
>>> +	rte_free(dst_sges);
>>> +	dst_sges = NULL;
>>> +
>>>  	/* free the worker parameters */
>>>  	for (i = 0; i < nb_workers; i++) {
>>>  		rte_free(lcores[i]);
>>>  		lcores[i] = NULL;
>>>  	}
>>>
>>> -	if (is_dma) {
>>> +	if (cfg->is_dma) {
>>>  		for (i = 0; i < nb_workers; i++) {
>>>  			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
>>>  			rte_dma_stop(ldm->dma_ids[i]);
>>> diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
>>> index 9c8221025e..28f6c9d1db 100644
>>> --- a/app/test-dma-perf/config.ini
>>> +++ b/app/test-dma-perf/config.ini
>>> @@ -38,6 +38,14 @@
>>>
>>>  ; "skip" To skip a test-case set skip to 1.
>>
>> Please place hese patchset new add entrys' descriptions above the
>> "; To specify a configuration file, use the "--config" flag followed by the path to
>> the file."
>>
>> because original config.ini, fist is parameters descriptor, and then program
>> argment descriptor, and last was example.
>>
> Ack.
>>>
>>> +; Parameters to be configured for SG copy:
>>
>> Parameters for DMA scatter-gather memory copy:
>>
> Ack.
>>> +; ========================================
>>
>> Please remove this line
>>
> Ack.
>>> +; "dma_src_sge" denotes number of source segments.
>>> +; "dma_dst_sge" denotes number of destination segments.
>>> +;
>>> +; For SG copy, both the parameters need to be configured and they are valid
>> only
>>> +; when type is DMA_MEM_COPY.
>>
>> For DMA scatter-gather memory copy, the parameters need to be configured
>> and they are valid only
>> when type is DMA_MEM_COPY.
>>
> Ack.
>>> +;
>>>  ; Parameters to be configured for data transfers from "mem to dev" and
>> "dev to mem":
>>>  ;
>> ===================================================================
>> ===============
>>
>> Please remove this line
>>
>> As another commit "Re: [PATCH v2] app/dma-perf: support bi-directional
>> transfer"'s review feedback,
>> these descriptor should place after
>> "
>> ; To use DMA for a test, please specify the "lcore_dma" parameter.
>> ; If you have already set the "-l" and "-a" parameters using EAL,
>> ; make sure that the value of "lcore_dma" falls within their range of the values.
>> ; We have to ensure a 1:1 mapping between the core and DMA device.
>> "
>>
>>
>>>  ; "direction" denotes the direction of data transfer. It can take 3 values:
>>> @@ -69,6 +77,21 @@ lcore_dma=lcore10@0000:00:04.2,
>> lcore11@0000:00:04.3
>>>  eal_args=--in-memory --file-prefix=test
>>>
>>>  [case2]
>>> +type=DMA_MEM_COPY
>>> +mem_size=10
>>> +buf_size=64,8192,2,MUL
>>> +dma_ring_size=1024
>>> +dma_src_sge=4
>>> +dma_dst_sge=1
>>> +kick_batch=32
>>> +src_numa_node=0
>>> +dst_numa_node=0
>>> +cache_flush=0
>>> +test_seconds=2
>>> +lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
>>> +eal_args=--in-memory --file-prefix=test
>>> +
>>> +[case3]
>>>  skip=1
>>>  type=DMA_MEM_COPY
>>>  direction=dev2mem
>>> @@ -84,7 +107,7 @@ test_seconds=2
>>>  lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
>>>  eal_args=--in-memory --file-prefix=test
>>>
>>> -[case3]
>>> +[case4]
>>>  type=CPU_MEM_COPY
>>>  mem_size=10
>>>  buf_size=64,8192,2,MUL
>>> diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
>>> index df05bcd7df..a27e4c9429 100644
>>> --- a/app/test-dma-perf/main.c
>>> +++ b/app/test-dma-perf/main.c
>>> @@ -108,10 +108,8 @@ run_test_case(struct test_configure *case_cfg)
>>>
>>>  	switch (case_cfg->test_type) {
>>>  	case TEST_TYPE_DMA_MEM_COPY:
>>> -		ret = mem_copy_benchmark(case_cfg, true);
>>> -		break;
>>>  	case TEST_TYPE_CPU_MEM_COPY:
>>> -		ret = mem_copy_benchmark(case_cfg, false);
>>> +		ret = mem_copy_benchmark(case_cfg);
>>>  		break;
>>>  	default:
>>>  		printf("Unknown test type. %s\n", case_cfg->test_type_str);
>>> @@ -365,7 +363,8 @@ load_configs(const char *path)
>>>  	const char *case_type;
>>>  	const char *transfer_dir;
>>>  	const char *lcore_dma;
>>> -	const char *mem_size_str, *buf_size_str, *ring_size_str,
>> *kick_batch_str;
>>> +	const char *mem_size_str, *buf_size_str, *ring_size_str,
>> *kick_batch_str,
>>> +		*src_ptrs_str, *dst_ptrs_str;
>>>  	const char *skip;
>>>  	struct rte_kvargs *kvlist;
>>>  	const char *vchan_dev;
>>> @@ -467,6 +466,7 @@ load_configs(const char *path)
>>>  			rte_kvargs_free(kvlist);
>>>  		}
>>>
>>> +		test_case->is_dma = is_dma;
>>>  		test_case->src_numa_node =
>> (int)atoi(rte_cfgfile_get_entry(cfgfile,
>>>
>> 	section_name, "src_numa_node"));
>>>  		test_case->dst_numa_node =
>> (int)atoi(rte_cfgfile_get_entry(cfgfile,
>>> @@ -501,6 +501,32 @@ load_configs(const char *path)
>>>  			} else if (args_nr == 4)
>>>  				nb_vp++;
>>>
>>> +			src_ptrs_str = rte_cfgfile_get_entry(cfgfile,
>> section_name,
>>> +
>> 	"dma_src_sge");
>>> +			if (src_ptrs_str != NULL) {
>>> +				test_case->src_ptrs =
>> (int)atoi(rte_cfgfile_get_entry(cfgfile,
>>> +
>> 	section_name, "dma_src_sge"));
>>> +			}
>>> +
>>> +			dst_ptrs_str = rte_cfgfile_get_entry(cfgfile,
>> section_name,
>>> +
>> 	"dma_dst_sge");
>>> +			if (dst_ptrs_str != NULL) {
>>> +				test_case->dst_ptrs =
>> (int)atoi(rte_cfgfile_get_entry(cfgfile,
>>> +
>> 	section_name, "dma_dst_sge"));
>>> +			}
>>> +
>>> +			if ((src_ptrs_str != NULL && dst_ptrs_str == NULL) ||
>>> +			    (src_ptrs_str == NULL && dst_ptrs_str != NULL)) {
>>
>> Please also check test_case->src_ptrs and test_case->dst_ptrs valid, make sure
>> there are >1 and <=UINT16_MAX
> 
> At present, this is uint8_t. Do we need it more than UINT8_MAX ?

ok

> 
>>
>>> +				printf("parse dma_src_sge, dma_dst_sge error
>> in case %d.\n",
>>> +					i + 1);
>>> +				test_case->is_valid = false;
>>> +				continue;
>>> +			} else if (src_ptrs_str != NULL && dst_ptrs_str != NULL)
>> {
>>> +				test_case->is_sg = true;
>>> +			} else {
>>> +				test_case->is_sg = false;
>>
>> the above could simple by: test_case->is_sg = (src_ptrs_str != NULL &&
>> dst_ptrs_str != NULL);
>>
> Added check for nb_ validation here. Please check in next version of patch.
>>> +			}
>>> +
>>>  			kick_batch_str = rte_cfgfile_get_entry(cfgfile,
>> section_name, "kick_batch");
>>>  			args_nr = parse_entry(kick_batch_str, &test_case-
>>> kick_batch);
>>>  			if (args_nr < 0) {
>>> diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
>>> index 1123e7524a..baf149b72b 100644
>>> --- a/app/test-dma-perf/main.h
>>> +++ b/app/test-dma-perf/main.h
>>> @@ -53,11 +53,14 @@ struct test_configure {
>>>  	uint16_t dst_numa_node;
>>>  	uint16_t opcode;
>>>  	bool is_dma;
>>> +	bool is_sg;
>>>  	struct lcore_dma_map_t lcore_dma_map;
>>>  	struct test_configure_entry mem_size;
>>>  	struct test_configure_entry buf_size;
>>>  	struct test_configure_entry ring_size;
>>>  	struct test_configure_entry kick_batch;
>>> +	uint8_t src_ptrs;
>>> +	uint8_t dst_ptrs;
>>>  	uint8_t cache_flush;
>>>  	uint32_t nr_buf;
>>>  	uint16_t test_secs;
>>> @@ -66,6 +69,6 @@ struct test_configure {
>>>  	struct test_vchan_dev_config vchan_dev;
>>>  };
>>>
>>> -int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
>>> +int mem_copy_benchmark(struct test_configure *cfg);
>>>
>>>  #endif /* MAIN_H */
>>>
> 
> Thank you for your review. Please confirm if there are any other changes
> and I hope next version goes through 😊
> 
> Regards,
> Gowrishankar
>
  
Gowrishankar Muthukrishnan March 1, 2024, 8:06 a.m. UTC | #4
Hi Fengcheng,

<cut>
> >>> -output_result(uint8_t scenario_id, uint32_t lcore_id, char
> >>> *dma_name,
> >> uint16_t ring_size,
> >>> -			uint16_t kick_batch, uint64_t ave_cycle, uint32_t
> >> buf_size, uint32_t nr_buf,
> >>> -			float memory, float bandwidth, float mops, bool
> >> is_dma)
> >>> +output_result(struct test_configure *cfg, struct lcore_params *para,
> >>> +			uint16_t kick_batch, uint64_t ave_cycle, uint32_t
> >> buf_size,
> >>> +			uint32_t nr_buf, float memory, float bandwidth, float
> >> mops)
> >>>  {
> >>> -	if (is_dma)
> >>> -		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size:
> >> %u.\n",
> >>> -				lcore_id, dma_name, ring_size, kick_batch);
> >>> -	else
> >>> +	uint16_t ring_size = cfg->ring_size.cur;
> >>> +	uint8_t scenario_id = cfg->scenario_id;
> >>> +	uint32_t lcore_id = para->lcore_id;
> >>> +	char *dma_name = para->dma_name;
> >>> +
> >>> +	if (cfg->is_dma) {
> >>> +		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size:
> >> %u", lcore_id,
> >>> +		       dma_name, ring_size, kick_batch);
> >>> +		if (cfg->is_sg)
> >>> +			printf(" DMA src ptrs: %u, dst ptrs: %u",
> >>> +			       para->src_ptrs, para->dst_ptrs);
> >>
> >> DMA src sges: %u DMA dst sges: %u
> >>
> >> I think we should add a column which title maybe misc, some like
> >> sg-src[4]- dst[1], and later we may add fill test, then this field
> >> could be pattern-
> >> 0x12345678
> >>
> >> And in "[PATCH v10 2/4] app/dma-perf: add PCI device support" commit,
> >> if the DMA was worked in non-mem2mem direction, we could add simple
> >> descriptor of direction and pcie.info in the above misc column.
> >>
> >
> > I am sorry, I could not understand complete picture here. Do you mean
> > we reserve a column and use it as per test type.
> >
> > For plain mem copy, nothing added.
> > For SG mem copy, instead of showing "DMA src sges: 1, dst sges: 4", print
> "sg-src[1]-dst[4]".
> > In future, when we add fill test in benchmark, this line instead be "pattern-
> 0x12345678".
> >
> > Is my understanding correct over here ?
> 
> Yes, some like this.
> 
This patch adds SGE info in an alignment with existing output.

I think it is better to add further extensions as we add new features. Since the app doesn't support the features that you mentioned, it is difficult to anticipate the requirements.
In fact, if the additional frameworks that we put in are not useful for those features, it could lead to stale code.
I would prefer if we can make these changes as we add new features.

> >
<cut>
> >>>  	}
> >>>
> >>>  	while (1) {
> >>> @@ -541,13 +702,53 @@ mem_copy_benchmark(struct test_configure
> >> *cfg, bool is_dma)
> >>>
> >>>  	rte_eal_mp_wait_lcore();
> >>>
> >>> -	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
> >>> -		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
> >>> -			   rte_pktmbuf_mtod(dsts[i], void *),
> >>> -			   cfg->buf_size.cur) != 0) {
> >>> -			printf("Copy validation fails for buffer number %d\n",
> >> i);
> >>> -			ret = -1;
> >>> -			goto out;
> >>> +	if (!cfg->is_sg && cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM)
> >> {
> >>> +		for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
> >>> +			if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
> >>> +					rte_pktmbuf_mtod(dsts[i], void *),
> >>> +					cfg->buf_size.cur) != 0) {
> >>> +				printf("Copy validation fails for buffer number
> >> %d\n", i);
> >>> +				ret = -1;
> >>> +				goto out;
> >>> +			}
> >>> +		}
> >>> +	} else if (cfg->is_sg && cfg->transfer_dir ==
> >> RTE_DMA_DIR_MEM_TO_MEM) {
> >>> +		size_t src_remsz = buf_size % cfg->src_ptrs;
> >>> +		size_t dst_remsz = buf_size % cfg->dst_ptrs;
> >>> +		size_t src_sz = buf_size / cfg->src_ptrs;
> >>> +		size_t dst_sz = buf_size / cfg->dst_ptrs;
> >>> +		uint8_t src[buf_size], dst[buf_size];
> >>> +		uint8_t *sbuf, *dbuf, *ptr;
> >>> +
> >>> +		for (i = 0; i < (nr_buf / RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs));
> >> i++) {
> >>> +			sbuf = src;
> >>> +			dbuf = dst;
> >>> +			ptr = NULL;
> >>> +
> >>> +			for (j = 0; j < cfg->src_ptrs; j++) {
> >>> +				ptr = rte_pktmbuf_mtod(srcs[i * cfg->src_ptrs
> >> + j], uint8_t *);
> >>> +				memcpy(sbuf, ptr, src_sz);
> >>> +				sbuf += src_sz;
> >>> +			}
> >>> +
> >>> +			if (src_remsz)
> >>> +				memcpy(sbuf, ptr + src_sz, src_remsz);
> >>> +
> >>> +			for (j = 0; j < cfg->dst_ptrs; j++) {
> >>> +				ptr = rte_pktmbuf_mtod(dsts[i * cfg->dst_ptrs
> >> + j], uint8_t *);
> >>> +				memcpy(dbuf, ptr, dst_sz);
> >>> +				dbuf += dst_sz;
> >>> +			}
> >>> +
> >>> +			if (dst_remsz)
> >>> +				memcpy(dbuf, ptr + dst_sz, dst_remsz);
> >>> +
> >>> +			if (memcmp(src, dst, buf_size) != 0) {
> >>> +				printf("SG Copy validation fails for buffer
> >> number %d\n",
> >>> +					i * cfg->src_ptrs);
> >>> +				ret = -1;
> >>> +				goto out;
> >>> +			}
> >>
> >> Now I doubt the value of verify, this verify can't find the middle
> >> round copy failure, because as long as the last round copy is
> >> successful, the validation will pass.
> >>
> > Validation is on entire buffer. If any middle copy is a failure,
> > entire memcmp would have failed. Or do I miss something ?
> >
> >> And adding validatation in every round copy will impact performance.
> >>
> > This validation is just after worker function is stopped measuring perf.
> > How would this impact performance ?
> 
> Yes, it will don't impact performance.
> 
> What I said before is that is not valid, pls consider following scene:
> 
> 
> 	while (1) {
> 		for (i = 0; i < nr_buf; i++) {  // this for loop will copy all nr_bufs,
> let's defind this is a round copy.
> dma_copy:
> 			ret = rte_dma_copy(dev_id, 0,
> rte_mbuf_data_iova(srcs[i]),
> 				rte_mbuf_data_iova(dsts[i]), buf_size, 0);
> 			if (unlikely(ret < 0)) {
> 				if (ret == -ENOSPC) {
> 					do_dma_submit_and_poll(dev_id,
> &async_cnt, worker_info);
> 					goto dma_copy;
> 				} else
> 					error_exit(dev_id);
> 			}
> 			async_cnt++;
> 
> 			if ((async_cnt % kick_batch) == 0)
> 				do_dma_submit_and_poll(dev_id,
> &async_cnt, worker_info);
> 		}
> 
> 		if (worker_info->stop_flag)   // if don't stop, it will do many
> round copies.
> 			break;
> 	}
> 
> and the later validation just verify the last round, let's assume there are 100
> round, and if the last round copy work well, but round 0~98 both copy fail,
> then the validation will not detect it.
> 
> 
> So if we want do all the validation, then we should add the velidation after
> every round copy, but it will impact the performance.
> 
> 
> >
> >> Also app/test_dmadev already verify data. so I think we should drop
> >> the validation commit.
> >
> > Even in some corner cases or unknown issues, copy would have failed
> > and taking perf cycles then is meaningless. That is the reason, this
> > validation is added after perf function doing its job.
> 
> How about:
> 
> 	while (1) {
> 		for (i = 0; i < nr_buf; i++) {  // this for loop will copy all nr_bufs,
> let's defind this is a round copy.
> dma_copy:
> 			ret = rte_dma_copy(dev_id, 0,
> rte_mbuf_data_iova(srcs[i]),
> 				rte_mbuf_data_iova(dsts[i]), buf_size, 0);
> 			if (unlikely(ret < 0)) {
> 				if (ret == -ENOSPC) {
> 					do_dma_submit_and_poll(dev_id,
> &async_cnt, worker_info);
> 					goto dma_copy;
> 				} else
> 					error_exit(dev_id);
> 			}
> 			async_cnt++;
> 
> 			if ((async_cnt % kick_batch) == 0)
> 				do_dma_submit_and_poll(dev_id,
> &async_cnt, worker_info);
> 		}
> 
> 		if (unlikely(work_info->verify)) {
> 			ret = verify();
> 			if (ret != 0) {
> 				// error trace,
> 				break;
> 			}
> 		}
> 
> 		if (worker_info->stop_flag)   // if don't stop, it will do many
> round copies.
> 			break;
> 	}
> 
> and make this verify as a config entry

I believe there is a difference in understanding of what this is intended to do. Intention here is not to validate every operation done by DMA, and that is already taken care by UT.

Is it possible that we are we misreporting numbers if the application is buggy or PMD is misbehaving for the scenario under test and the copies are not actually performed? Yes. Think about a scenario where PMD is buggy when trying bursts of more than 1.

Checking last set of buffers is more like testing a sample from the perf test to make sure perf test was indeed performing what it is claiming to do. If you think it is unnecessary to do so, we can drop this from upstream. But adding complete verification in performance app would be repeating what a unit test is expected to do. I would suggest not to do that.

Thanks,
Gowrishankar
  
Chengwen Feng March 1, 2024, 9:45 a.m. UTC | #5
On 2024/3/1 16:06, Gowrishankar Muthukrishnan wrote:
> Hi Fengcheng,
> 
> <cut>
>>>>> -output_result(uint8_t scenario_id, uint32_t lcore_id, char
>>>>> *dma_name,
>>>> uint16_t ring_size,
>>>>> -			uint16_t kick_batch, uint64_t ave_cycle, uint32_t
>>>> buf_size, uint32_t nr_buf,
>>>>> -			float memory, float bandwidth, float mops, bool
>>>> is_dma)
>>>>> +output_result(struct test_configure *cfg, struct lcore_params *para,
>>>>> +			uint16_t kick_batch, uint64_t ave_cycle, uint32_t
>>>> buf_size,
>>>>> +			uint32_t nr_buf, float memory, float bandwidth, float
>>>> mops)
>>>>>  {
>>>>> -	if (is_dma)
>>>>> -		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size:
>>>> %u.\n",
>>>>> -				lcore_id, dma_name, ring_size, kick_batch);
>>>>> -	else
>>>>> +	uint16_t ring_size = cfg->ring_size.cur;
>>>>> +	uint8_t scenario_id = cfg->scenario_id;
>>>>> +	uint32_t lcore_id = para->lcore_id;
>>>>> +	char *dma_name = para->dma_name;
>>>>> +
>>>>> +	if (cfg->is_dma) {
>>>>> +		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size:
>>>> %u", lcore_id,
>>>>> +		       dma_name, ring_size, kick_batch);
>>>>> +		if (cfg->is_sg)
>>>>> +			printf(" DMA src ptrs: %u, dst ptrs: %u",
>>>>> +			       para->src_ptrs, para->dst_ptrs);
>>>>
>>>> DMA src sges: %u DMA dst sges: %u
>>>>
>>>> I think we should add a column which title maybe misc, some like
>>>> sg-src[4]- dst[1], and later we may add fill test, then this field
>>>> could be pattern-
>>>> 0x12345678
>>>>
>>>> And in "[PATCH v10 2/4] app/dma-perf: add PCI device support" commit,
>>>> if the DMA was worked in non-mem2mem direction, we could add simple
>>>> descriptor of direction and pcie.info in the above misc column.
>>>>
>>>
>>> I am sorry, I could not understand complete picture here. Do you mean
>>> we reserve a column and use it as per test type.
>>>
>>> For plain mem copy, nothing added.
>>> For SG mem copy, instead of showing "DMA src sges: 1, dst sges: 4", print
>> "sg-src[1]-dst[4]".
>>> In future, when we add fill test in benchmark, this line instead be "pattern-
>> 0x12345678".
>>>
>>> Is my understanding correct over here ?
>>
>> Yes, some like this.
>>
> This patch adds SGE info in an alignment with existing output.
> 
> I think it is better to add further extensions as we add new features. Since the app doesn't support the features that you mentioned, it is difficult to anticipate the requirements.
> In fact, if the additional frameworks that we put in are not useful for those features, it could lead to stale code.
> I would prefer if we can make these changes as we add new features.
> 
>>>
> <cut>
>>>>>  	}
>>>>>
>>>>>  	while (1) {
>>>>> @@ -541,13 +702,53 @@ mem_copy_benchmark(struct test_configure
>>>> *cfg, bool is_dma)
>>>>>
>>>>>  	rte_eal_mp_wait_lcore();
>>>>>
>>>>> -	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
>>>>> -		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
>>>>> -			   rte_pktmbuf_mtod(dsts[i], void *),
>>>>> -			   cfg->buf_size.cur) != 0) {
>>>>> -			printf("Copy validation fails for buffer number %d\n",
>>>> i);
>>>>> -			ret = -1;
>>>>> -			goto out;
>>>>> +	if (!cfg->is_sg && cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM)
>>>> {
>>>>> +		for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
>>>>> +			if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
>>>>> +					rte_pktmbuf_mtod(dsts[i], void *),
>>>>> +					cfg->buf_size.cur) != 0) {
>>>>> +				printf("Copy validation fails for buffer number
>>>> %d\n", i);
>>>>> +				ret = -1;
>>>>> +				goto out;
>>>>> +			}
>>>>> +		}
>>>>> +	} else if (cfg->is_sg && cfg->transfer_dir ==
>>>> RTE_DMA_DIR_MEM_TO_MEM) {
>>>>> +		size_t src_remsz = buf_size % cfg->src_ptrs;
>>>>> +		size_t dst_remsz = buf_size % cfg->dst_ptrs;
>>>>> +		size_t src_sz = buf_size / cfg->src_ptrs;
>>>>> +		size_t dst_sz = buf_size / cfg->dst_ptrs;
>>>>> +		uint8_t src[buf_size], dst[buf_size];
>>>>> +		uint8_t *sbuf, *dbuf, *ptr;
>>>>> +
>>>>> +		for (i = 0; i < (nr_buf / RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs));
>>>> i++) {
>>>>> +			sbuf = src;
>>>>> +			dbuf = dst;
>>>>> +			ptr = NULL;
>>>>> +
>>>>> +			for (j = 0; j < cfg->src_ptrs; j++) {
>>>>> +				ptr = rte_pktmbuf_mtod(srcs[i * cfg->src_ptrs
>>>> + j], uint8_t *);
>>>>> +				memcpy(sbuf, ptr, src_sz);
>>>>> +				sbuf += src_sz;
>>>>> +			}
>>>>> +
>>>>> +			if (src_remsz)
>>>>> +				memcpy(sbuf, ptr + src_sz, src_remsz);
>>>>> +
>>>>> +			for (j = 0; j < cfg->dst_ptrs; j++) {
>>>>> +				ptr = rte_pktmbuf_mtod(dsts[i * cfg->dst_ptrs
>>>> + j], uint8_t *);
>>>>> +				memcpy(dbuf, ptr, dst_sz);
>>>>> +				dbuf += dst_sz;
>>>>> +			}
>>>>> +
>>>>> +			if (dst_remsz)
>>>>> +				memcpy(dbuf, ptr + dst_sz, dst_remsz);
>>>>> +
>>>>> +			if (memcmp(src, dst, buf_size) != 0) {
>>>>> +				printf("SG Copy validation fails for buffer
>>>> number %d\n",
>>>>> +					i * cfg->src_ptrs);
>>>>> +				ret = -1;
>>>>> +				goto out;
>>>>> +			}
>>>>
>>>> Now I doubt the value of verify, this verify can't find the middle
>>>> round copy failure, because as long as the last round copy is
>>>> successful, the validation will pass.
>>>>
>>> Validation is on entire buffer. If any middle copy is a failure,
>>> entire memcmp would have failed. Or do I miss something ?
>>>
>>>> And adding validatation in every round copy will impact performance.
>>>>
>>> This validation is just after worker function is stopped measuring perf.
>>> How would this impact performance ?
>>
>> Yes, it will don't impact performance.
>>
>> What I said before is that is not valid, pls consider following scene:
>>
>>
>> 	while (1) {
>> 		for (i = 0; i < nr_buf; i++) {  // this for loop will copy all nr_bufs,
>> let's defind this is a round copy.
>> dma_copy:
>> 			ret = rte_dma_copy(dev_id, 0,
>> rte_mbuf_data_iova(srcs[i]),
>> 				rte_mbuf_data_iova(dsts[i]), buf_size, 0);
>> 			if (unlikely(ret < 0)) {
>> 				if (ret == -ENOSPC) {
>> 					do_dma_submit_and_poll(dev_id,
>> &async_cnt, worker_info);
>> 					goto dma_copy;
>> 				} else
>> 					error_exit(dev_id);
>> 			}
>> 			async_cnt++;
>>
>> 			if ((async_cnt % kick_batch) == 0)
>> 				do_dma_submit_and_poll(dev_id,
>> &async_cnt, worker_info);
>> 		}
>>
>> 		if (worker_info->stop_flag)   // if don't stop, it will do many
>> round copies.
>> 			break;
>> 	}
>>
>> and the later validation just verify the last round, let's assume there are 100
>> round, and if the last round copy work well, but round 0~98 both copy fail,
>> then the validation will not detect it.
>>
>>
>> So if we want do all the validation, then we should add the velidation after
>> every round copy, but it will impact the performance.
>>
>>
>>>
>>>> Also app/test_dmadev already verify data. so I think we should drop
>>>> the validation commit.
>>>
>>> Even in some corner cases or unknown issues, copy would have failed
>>> and taking perf cycles then is meaningless. That is the reason, this
>>> validation is added after perf function doing its job.
>>
>> How about:
>>
>> 	while (1) {
>> 		for (i = 0; i < nr_buf; i++) {  // this for loop will copy all nr_bufs,
>> let's defind this is a round copy.
>> dma_copy:
>> 			ret = rte_dma_copy(dev_id, 0,
>> rte_mbuf_data_iova(srcs[i]),
>> 				rte_mbuf_data_iova(dsts[i]), buf_size, 0);
>> 			if (unlikely(ret < 0)) {
>> 				if (ret == -ENOSPC) {
>> 					do_dma_submit_and_poll(dev_id,
>> &async_cnt, worker_info);
>> 					goto dma_copy;
>> 				} else
>> 					error_exit(dev_id);
>> 			}
>> 			async_cnt++;
>>
>> 			if ((async_cnt % kick_batch) == 0)
>> 				do_dma_submit_and_poll(dev_id,
>> &async_cnt, worker_info);
>> 		}
>>
>> 		if (unlikely(work_info->verify)) {
>> 			ret = verify();
>> 			if (ret != 0) {
>> 				// error trace,
>> 				break;
>> 			}
>> 		}
>>
>> 		if (worker_info->stop_flag)   // if don't stop, it will do many
>> round copies.
>> 			break;
>> 	}
>>
>> and make this verify as a config entry
> 
> I believe there is a difference in understanding of what this is intended to do. Intention here is not to validate every operation done by DMA, and that is already taken care by UT.
> 
> Is it possible that we are we misreporting numbers if the application is buggy or PMD is misbehaving for the scenario under test and the copies are not actually performed? Yes. Think about a scenario where PMD is buggy when trying bursts of more than 1.
> 
> Checking last set of buffers is more like testing a sample from the perf test to make sure perf test was indeed performing what it is claiming to do. If you think it is unnecessary to do so, we can drop this from upstream. But adding complete verification in performance app would be repeating what a unit test is expected to do. I would suggest not to do that.

I think this commit is mainly test the dma-perf itself.

OK with continue this commit.

Thanks

> 
> Thanks,
> Gowrishankar
>
  

Patch

diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 0047e2f4b8..25ed6fa6d0 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -46,6 +46,10 @@  struct lcore_params {
 	uint16_t test_secs;
 	struct rte_mbuf **srcs;
 	struct rte_mbuf **dsts;
+	struct rte_dma_sge *src_sges;
+	struct rte_dma_sge *dst_sges;
+	uint8_t src_ptrs;
+	uint8_t dst_ptrs;
 	volatile struct worker_info worker_info;
 };
 
@@ -86,21 +90,31 @@  calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t te
 }
 
 static void
-output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t ring_size,
-			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size, uint32_t nr_buf,
-			float memory, float bandwidth, float mops, bool is_dma)
+output_result(struct test_configure *cfg, struct lcore_params *para,
+			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size,
+			uint32_t nr_buf, float memory, float bandwidth, float mops)
 {
-	if (is_dma)
-		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u.\n",
-				lcore_id, dma_name, ring_size, kick_batch);
-	else
+	uint16_t ring_size = cfg->ring_size.cur;
+	uint8_t scenario_id = cfg->scenario_id;
+	uint32_t lcore_id = para->lcore_id;
+	char *dma_name = para->dma_name;
+
+	if (cfg->is_dma) {
+		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u", lcore_id,
+		       dma_name, ring_size, kick_batch);
+		if (cfg->is_sg)
+			printf(" DMA src ptrs: %u, dst ptrs: %u",
+			       para->src_ptrs, para->dst_ptrs);
+		printf(".\n");
+	} else {
 		printf("lcore %u\n", lcore_id);
+	}
 
 	printf("Average Cycles/op: %" PRIu64 ", Buffer Size: %u B, Buffer Number: %u, Memory: %.2lf MB, Frequency: %.3lf Ghz.\n",
 			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz()/1000000000.0);
 	printf("Average Bandwidth: %.3lf Gbps, MOps: %.3lf\n", bandwidth, mops);
 
-	if (is_dma)
+	if (cfg->is_dma)
 		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
 			scenario_id, lcore_id, dma_name, ring_size, kick_batch, buf_size,
 			nr_buf, memory, ave_cycle, bandwidth, mops);
@@ -167,7 +181,7 @@  vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
 
 /* Configuration of device. */
 static void
-configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
+configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg, uint8_t ptrs_max)
 {
 	uint16_t vchan = 0;
 	struct rte_dma_info info;
@@ -190,6 +204,10 @@  configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg)
 		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id. %u\n",
 				dev_id);
 
+	if (info.max_sges < ptrs_max)
+		rte_exit(EXIT_FAILURE, "Error, DMA ptrs more than supported by device id %u.\n",
+				dev_id);
+
 	if (rte_dma_start(dev_id) != 0)
 		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
 }
@@ -202,8 +220,12 @@  config_dmadevs(struct test_configure *cfg)
 	uint32_t i;
 	int dev_id;
 	uint16_t nb_dmadevs = 0;
+	uint8_t ptrs_max = 0;
 	char *dma_name;
 
+	if (cfg->is_sg)
+		ptrs_max = RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs);
+
 	for (i = 0; i < ldm->cnt; i++) {
 		dma_name = ldm->dma_names[i];
 		dev_id = rte_dma_get_dev_id_by_name(dma_name);
@@ -213,7 +235,7 @@  config_dmadevs(struct test_configure *cfg)
 		}
 
 		ldm->dma_ids[i] = dev_id;
-		configure_dmadev_queue(dev_id, cfg);
+		configure_dmadev_queue(dev_id, cfg, ptrs_max);
 		++nb_dmadevs;
 	}
 
@@ -253,7 +275,7 @@  do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
 }
 
 static inline int
-do_dma_mem_copy(void *p)
+do_dma_plain_mem_copy(void *p)
 {
 	struct lcore_params *para = (struct lcore_params *)p;
 	volatile struct worker_info *worker_info = &(para->worker_info);
@@ -306,6 +328,65 @@  do_dma_mem_copy(void *p)
 	return 0;
 }
 
+static inline int
+do_dma_sg_mem_copy(void *p)
+{
+	struct lcore_params *para = (struct lcore_params *)p;
+	volatile struct worker_info *worker_info = &(para->worker_info);
+	struct rte_dma_sge *src_sges = para->src_sges;
+	struct rte_dma_sge *dst_sges = para->dst_sges;
+	const uint16_t kick_batch = para->kick_batch;
+	const uint8_t src_ptrs = para->src_ptrs;
+	const uint8_t dst_ptrs = para->dst_ptrs;
+	const uint16_t dev_id = para->dev_id;
+	uint32_t nr_buf = para->nr_buf;
+	uint64_t async_cnt = 0;
+	uint32_t poll_cnt = 0;
+	uint16_t nr_cpl;
+	uint32_t i, j;
+	int ret;
+
+	nr_buf /= RTE_MAX(src_ptrs, dst_ptrs);
+	worker_info->stop_flag = false;
+	worker_info->ready_flag = true;
+
+	while (!worker_info->start_flag)
+		;
+
+	while (1) {
+		j = 0;
+		for (i = 0; i < nr_buf; i++) {
+dma_copy:
+			ret = rte_dma_copy_sg(dev_id, 0,
+				&src_sges[i * src_ptrs], &dst_sges[j * dst_ptrs],
+				src_ptrs, dst_ptrs, 0);
+			if (unlikely(ret < 0)) {
+				if (ret == -ENOSPC) {
+					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+					goto dma_copy;
+				} else
+					error_exit(dev_id);
+			}
+			async_cnt++;
+			j++;
+
+			if ((async_cnt % kick_batch) == 0)
+				do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
+		}
+
+		if (worker_info->stop_flag)
+			break;
+	}
+
+	rte_dma_submit(dev_id, 0);
+	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
+		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
+		async_cnt -= nr_cpl;
+	}
+
+	return 0;
+}
+
 static inline int
 do_cpu_mem_copy(void *p)
 {
@@ -347,8 +428,9 @@  dummy_free_ext_buf(void *addr, void *opaque)
 }
 
 static int
-setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
-			struct rte_mbuf ***dsts)
+setup_memory_env(struct test_configure *cfg,
+			 struct rte_mbuf ***srcs, struct rte_mbuf ***dsts,
+			 struct rte_dma_sge **src_sges, struct rte_dma_sge **dst_sges)
 {
 	static struct rte_mbuf_ext_shared_info *ext_buf_info;
 	unsigned int cur_buf_size = cfg->buf_size.cur;
@@ -409,8 +491,8 @@  setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 	}
 
 	for (i = 0; i < nr_buf; i++) {
-		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(), buf_size);
-		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, buf_size);
+		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(), cur_buf_size);
+		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, cur_buf_size);
 	}
 
 	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
@@ -446,20 +528,56 @@  setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
 		}
 	}
 
+	if (cfg->is_sg) {
+		uint8_t src_ptrs = cfg->src_ptrs;
+		uint8_t dst_ptrs = cfg->dst_ptrs;
+		uint32_t sglen_src, sglen_dst;
+
+		*src_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_dma_sge),
+					RTE_CACHE_LINE_SIZE);
+		if (*src_sges == NULL) {
+			printf("Error: src_sges array malloc failed.\n");
+			return -1;
+		}
+
+		*dst_sges = rte_zmalloc(NULL, nr_buf * sizeof(struct rte_dma_sge),
+					RTE_CACHE_LINE_SIZE);
+		if (*dst_sges == NULL) {
+			printf("Error: dst_sges array malloc failed.\n");
+			return -1;
+		}
+
+		sglen_src = cur_buf_size / src_ptrs;
+		sglen_dst = cur_buf_size / dst_ptrs;
+
+		for (i = 0; i < nr_buf; i++) {
+			(*src_sges)[i].addr = rte_pktmbuf_iova((*srcs)[i]);
+			(*src_sges)[i].length = sglen_src;
+			if (!((i+1) % src_ptrs))
+				(*src_sges)[i].length += (cur_buf_size % src_ptrs);
+
+			(*dst_sges)[i].addr = rte_pktmbuf_iova((*dsts)[i]);
+			(*dst_sges)[i].length = sglen_dst;
+			if (!((i+1) % dst_ptrs))
+				(*dst_sges)[i].length += (cur_buf_size % dst_ptrs);
+		}
+	}
+
 	return 0;
 }
 
 int
-mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
+mem_copy_benchmark(struct test_configure *cfg)
 {
-	uint32_t i;
+	uint32_t i, j;
 	uint32_t offset;
 	unsigned int lcore_id = 0;
 	struct rte_mbuf **srcs = NULL, **dsts = NULL, **m = NULL;
+	struct rte_dma_sge *src_sges = NULL, *dst_sges = NULL;
 	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	const uint32_t mcore_id = rte_get_main_lcore();
 	unsigned int buf_size = cfg->buf_size.cur;
 	uint16_t kick_batch = cfg->kick_batch.cur;
-	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
 	uint16_t nb_workers = ldm->cnt;
 	uint16_t test_secs = cfg->test_secs;
 	float memory = 0;
@@ -467,12 +585,32 @@  mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	uint32_t avg_cycles_total;
 	float mops, mops_total;
 	float bandwidth, bandwidth_total;
+	uint32_t nr_sgsrc = 0, nr_sgdst = 0;
+	uint32_t nr_buf;
 	int ret = 0;
 
-	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
+	/* Align number of buffers according to workers count */
+	nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
+	nr_buf -= (nr_buf % nb_workers);
+	if (cfg->is_sg) {
+		nr_buf /= nb_workers;
+		nr_buf -= nr_buf % (cfg->src_ptrs * cfg->dst_ptrs);
+		nr_buf *= nb_workers;
+
+		if (cfg->dst_ptrs > cfg->src_ptrs) {
+			nr_sgsrc = (nr_buf / cfg->dst_ptrs * cfg->src_ptrs);
+			nr_sgdst = nr_buf;
+		} else {
+			nr_sgsrc = nr_buf;
+			nr_sgdst = (nr_buf / cfg->src_ptrs * cfg->dst_ptrs);
+		}
+	}
+
+	cfg->nr_buf = nr_buf;
+	if (setup_memory_env(cfg, &srcs, &dsts, &src_sges, &dst_sges) < 0)
 		goto out;
 
-	if (is_dma)
+	if (cfg->is_dma)
 		if (config_dmadevs(cfg) < 0)
 			goto out;
 
@@ -486,13 +624,23 @@  mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 
 	for (i = 0; i < nb_workers; i++) {
 		lcore_id = ldm->lcores[i];
+		if (lcore_id == mcore_id) {
+			printf("lcore parameters can not use main core id %d\n", mcore_id);
+			goto out;
+		}
+
+		if (rte_eal_lcore_role(lcore_id) == ROLE_OFF) {
+			printf("lcore parameters can not use offline core id %d\n", lcore_id);
+			goto out;
+		}
+
 		offset = nr_buf / nb_workers * i;
 		lcores[i] = rte_malloc(NULL, sizeof(struct lcore_params), 0);
 		if (lcores[i] == NULL) {
 			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
 			break;
 		}
-		if (is_dma) {
+		if (cfg->is_dma) {
 			lcores[i]->dma_name = ldm->dma_names[i];
 			lcores[i]->dev_id = ldm->dma_ids[i];
 			lcores[i]->kick_batch = kick_batch;
@@ -506,10 +654,23 @@  mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 		lcores[i]->scenario_id = cfg->scenario_id;
 		lcores[i]->lcore_id = lcore_id;
 
-		if (is_dma)
-			rte_eal_remote_launch(do_dma_mem_copy, (void *)(lcores[i]), lcore_id);
-		else
+		if (cfg->is_sg) {
+			lcores[i]->src_ptrs = cfg->src_ptrs;
+			lcores[i]->dst_ptrs = cfg->dst_ptrs;
+			lcores[i]->src_sges = src_sges + (nr_sgsrc / nb_workers * i);
+			lcores[i]->dst_sges = dst_sges + (nr_sgdst / nb_workers * i);
+		}
+
+		if (cfg->is_dma) {
+			if (!cfg->is_sg)
+				rte_eal_remote_launch(do_dma_plain_mem_copy, (void *)(lcores[i]),
+					lcore_id);
+			else
+				rte_eal_remote_launch(do_dma_sg_mem_copy, (void *)(lcores[i]),
+					lcore_id);
+		} else {
 			rte_eal_remote_launch(do_cpu_mem_copy, (void *)(lcores[i]), lcore_id);
+		}
 	}
 
 	while (1) {
@@ -541,13 +702,53 @@  mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 
 	rte_eal_mp_wait_lcore();
 
-	for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
-		if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
-			   rte_pktmbuf_mtod(dsts[i], void *),
-			   cfg->buf_size.cur) != 0) {
-			printf("Copy validation fails for buffer number %d\n", i);
-			ret = -1;
-			goto out;
+	if (!cfg->is_sg && cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM) {
+		for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
+			if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
+					rte_pktmbuf_mtod(dsts[i], void *),
+					cfg->buf_size.cur) != 0) {
+				printf("Copy validation fails for buffer number %d\n", i);
+				ret = -1;
+				goto out;
+			}
+		}
+	} else if (cfg->is_sg && cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM) {
+		size_t src_remsz = buf_size % cfg->src_ptrs;
+		size_t dst_remsz = buf_size % cfg->dst_ptrs;
+		size_t src_sz = buf_size / cfg->src_ptrs;
+		size_t dst_sz = buf_size / cfg->dst_ptrs;
+		uint8_t src[buf_size], dst[buf_size];
+		uint8_t *sbuf, *dbuf, *ptr;
+
+		for (i = 0; i < (nr_buf / RTE_MAX(cfg->src_ptrs, cfg->dst_ptrs)); i++) {
+			sbuf = src;
+			dbuf = dst;
+			ptr = NULL;
+
+			for (j = 0; j < cfg->src_ptrs; j++) {
+				ptr = rte_pktmbuf_mtod(srcs[i * cfg->src_ptrs + j], uint8_t *);
+				memcpy(sbuf, ptr, src_sz);
+				sbuf += src_sz;
+			}
+
+			if (src_remsz)
+				memcpy(sbuf, ptr + src_sz, src_remsz);
+
+			for (j = 0; j < cfg->dst_ptrs; j++) {
+				ptr = rte_pktmbuf_mtod(dsts[i * cfg->dst_ptrs + j], uint8_t *);
+				memcpy(dbuf, ptr, dst_sz);
+				dbuf += dst_sz;
+			}
+
+			if (dst_remsz)
+				memcpy(dbuf, ptr + dst_sz, dst_remsz);
+
+			if (memcmp(src, dst, buf_size) != 0) {
+				printf("SG Copy validation fails for buffer number %d\n",
+					i * cfg->src_ptrs);
+				ret = -1;
+				goto out;
+			}
 		}
 	}
 
@@ -558,10 +759,8 @@  mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 		calc_result(buf_size, nr_buf, nb_workers, test_secs,
 			lcores[i]->worker_info.test_cpl,
 			&memory, &avg_cycles, &bandwidth, &mops);
-		output_result(cfg->scenario_id, lcores[i]->lcore_id,
-					lcores[i]->dma_name, cfg->ring_size.cur, kick_batch,
-					avg_cycles, buf_size, nr_buf / nb_workers, memory,
-					bandwidth, mops, is_dma);
+		output_result(cfg, lcores[i], kick_batch, avg_cycles, buf_size,
+			nr_buf / nb_workers, memory, bandwidth, mops);
 		mops_total += mops;
 		bandwidth_total += bandwidth;
 		avg_cycles_total += avg_cycles;
@@ -604,13 +803,20 @@  mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
 	rte_mempool_free(dst_pool);
 	dst_pool = NULL;
 
+	/* free sges for mbufs */
+	rte_free(src_sges);
+	src_sges = NULL;
+
+	rte_free(dst_sges);
+	dst_sges = NULL;
+
 	/* free the worker parameters */
 	for (i = 0; i < nb_workers; i++) {
 		rte_free(lcores[i]);
 		lcores[i] = NULL;
 	}
 
-	if (is_dma) {
+	if (cfg->is_dma) {
 		for (i = 0; i < nb_workers; i++) {
 			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
 			rte_dma_stop(ldm->dma_ids[i]);
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index 9c8221025e..28f6c9d1db 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -38,6 +38,14 @@ 
 
 ; "skip" To skip a test-case set skip to 1.
 
+; Parameters to be configured for SG copy:
+; ========================================
+; "dma_src_sge" denotes number of source segments.
+; "dma_dst_sge" denotes number of destination segments.
+;
+; For SG copy, both the parameters need to be configured and they are valid only
+; when type is DMA_MEM_COPY.
+;
 ; Parameters to be configured for data transfers from "mem to dev" and "dev to mem":
 ; ==================================================================================
 ; "direction" denotes the direction of data transfer. It can take 3 values:
@@ -69,6 +77,21 @@  lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
 eal_args=--in-memory --file-prefix=test
 
 [case2]
+type=DMA_MEM_COPY
+mem_size=10
+buf_size=64,8192,2,MUL
+dma_ring_size=1024
+dma_src_sge=4
+dma_dst_sge=1
+kick_batch=32
+src_numa_node=0
+dst_numa_node=0
+cache_flush=0
+test_seconds=2
+lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+eal_args=--in-memory --file-prefix=test
+
+[case3]
 skip=1
 type=DMA_MEM_COPY
 direction=dev2mem
@@ -84,7 +107,7 @@  test_seconds=2
 lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
 eal_args=--in-memory --file-prefix=test
 
-[case3]
+[case4]
 type=CPU_MEM_COPY
 mem_size=10
 buf_size=64,8192,2,MUL
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index df05bcd7df..a27e4c9429 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -108,10 +108,8 @@  run_test_case(struct test_configure *case_cfg)
 
 	switch (case_cfg->test_type) {
 	case TEST_TYPE_DMA_MEM_COPY:
-		ret = mem_copy_benchmark(case_cfg, true);
-		break;
 	case TEST_TYPE_CPU_MEM_COPY:
-		ret = mem_copy_benchmark(case_cfg, false);
+		ret = mem_copy_benchmark(case_cfg);
 		break;
 	default:
 		printf("Unknown test type. %s\n", case_cfg->test_type_str);
@@ -365,7 +363,8 @@  load_configs(const char *path)
 	const char *case_type;
 	const char *transfer_dir;
 	const char *lcore_dma;
-	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str;
+	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str,
+		*src_ptrs_str, *dst_ptrs_str;
 	const char *skip;
 	struct rte_kvargs *kvlist;
 	const char *vchan_dev;
@@ -467,6 +466,7 @@  load_configs(const char *path)
 			rte_kvargs_free(kvlist);
 		}
 
+		test_case->is_dma = is_dma;
 		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
 								section_name, "src_numa_node"));
 		test_case->dst_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
@@ -501,6 +501,32 @@  load_configs(const char *path)
 			} else if (args_nr == 4)
 				nb_vp++;
 
+			src_ptrs_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_src_sge");
+			if (src_ptrs_str != NULL) {
+				test_case->src_ptrs = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "dma_src_sge"));
+			}
+
+			dst_ptrs_str = rte_cfgfile_get_entry(cfgfile, section_name,
+								"dma_dst_sge");
+			if (dst_ptrs_str != NULL) {
+				test_case->dst_ptrs = (int)atoi(rte_cfgfile_get_entry(cfgfile,
+								section_name, "dma_dst_sge"));
+			}
+
+			if ((src_ptrs_str != NULL && dst_ptrs_str == NULL) ||
+			    (src_ptrs_str == NULL && dst_ptrs_str != NULL)) {
+				printf("parse dma_src_sge, dma_dst_sge error in case %d.\n",
+					i + 1);
+				test_case->is_valid = false;
+				continue;
+			} else if (src_ptrs_str != NULL && dst_ptrs_str != NULL) {
+				test_case->is_sg = true;
+			} else {
+				test_case->is_sg = false;
+			}
+
 			kick_batch_str = rte_cfgfile_get_entry(cfgfile, section_name, "kick_batch");
 			args_nr = parse_entry(kick_batch_str, &test_case->kick_batch);
 			if (args_nr < 0) {
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 1123e7524a..baf149b72b 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -53,11 +53,14 @@  struct test_configure {
 	uint16_t dst_numa_node;
 	uint16_t opcode;
 	bool is_dma;
+	bool is_sg;
 	struct lcore_dma_map_t lcore_dma_map;
 	struct test_configure_entry mem_size;
 	struct test_configure_entry buf_size;
 	struct test_configure_entry ring_size;
 	struct test_configure_entry kick_batch;
+	uint8_t src_ptrs;
+	uint8_t dst_ptrs;
 	uint8_t cache_flush;
 	uint32_t nr_buf;
 	uint16_t test_secs;
@@ -66,6 +69,6 @@  struct test_configure {
 	struct test_vchan_dev_config vchan_dev;
 };
 
-int mem_copy_benchmark(struct test_configure *cfg, bool is_dma);
+int mem_copy_benchmark(struct test_configure *cfg);
 
 #endif /* MAIN_H */