app/dma-perf: support bi-directional transfer

Message ID 20240108082749.1016345-1-amitprakashs@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series app/dma-perf: support bi-directional transfer |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation warning apply issues
ci/loongarch-compilation warning apply patch failure
ci/github-robot: build success github build: passed
ci/iol-testing warning apply patch failure

Commit Message

Amit Prakash Shukla Jan. 8, 2024, 8:27 a.m. UTC
  Adds bi-directional DMA transfer support to test performance.

Signed-off-by: Amit Prakash Shukla <amitprakashs@marvell.com>
---
Depends-on: series-30357 ("PCI Dev and SG copy support")

 app/test-dma-perf/benchmark.c | 89 +++++++++++++++++++++++++----------
 app/test-dma-perf/config.ini  |  5 ++
 app/test-dma-perf/main.c      | 21 +++++++--
 app/test-dma-perf/main.h      |  1 +
 4 files changed, 88 insertions(+), 28 deletions(-)
  

Comments

fengchengwen Feb. 21, 2024, 6:24 a.m. UTC | #1
Hi Amit,

I didn't get the bi-directional meaning. Does it mean: one core + one dmadev works for mem2dev
while another core + another dmadev works for dev2mem?

Thanks

On 2024/1/8 16:27, Amit Prakash Shukla wrote:
> Adds bi-directional DMA transfer support to test performance.
> 
> Signed-off-by: Amit Prakash Shukla <amitprakashs@marvell.com>
> ---
> Depends-on: series-30357 ("PCI Dev and SG copy support")
> 
>  app/test-dma-perf/benchmark.c | 89 +++++++++++++++++++++++++----------
>  app/test-dma-perf/config.ini  |  5 ++
>  app/test-dma-perf/main.c      | 21 +++++++--
>  app/test-dma-perf/main.h      |  1 +
>  4 files changed, 88 insertions(+), 28 deletions(-)
> 
> diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
> index 4530bd98ce..91ba0f4718 100644
> --- a/app/test-dma-perf/benchmark.c
> +++ b/app/test-dma-perf/benchmark.c
> @@ -144,12 +144,19 @@ cache_flush_buf(__rte_unused struct rte_mbuf **array,
>  
>  static int
>  vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
> -		    struct test_configure *cfg)
> +		    struct test_configure *cfg, uint16_t dev_num)
>  {
>  	struct rte_dma_info info;
>  
>  	qconf->direction = cfg->transfer_dir;
>  
> +	/* If it's a bi-directional test, configure odd device for outbound dma
> +	 * transfer and even device for inbound dma transfer.
> +	 */
> +	if (cfg->is_bidir)
> +		qconf->direction = (dev_num % 2) ? RTE_DMA_DIR_MEM_TO_DEV :
> +				   RTE_DMA_DIR_DEV_TO_MEM;
> +
>  	rte_dma_info_get(dev_id, &info);
>  	if (!(RTE_BIT64(qconf->direction) & info.dev_capa))
>  		return -1;
> @@ -181,14 +188,15 @@ vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
>  
>  /* Configuration of device. */
>  static void
> -configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg, uint8_t ptrs_max)
> +configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg, uint8_t ptrs_max,
> +		       uint16_t dev_num)
>  {
>  	uint16_t vchan = 0;
>  	struct rte_dma_info info;
>  	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
>  	struct rte_dma_vchan_conf qconf = { 0 };
>  
> -	if (vchan_data_populate(dev_id, &qconf, cfg) != 0)
> +	if (vchan_data_populate(dev_id, &qconf, cfg, dev_num) != 0)
>  		rte_exit(EXIT_FAILURE, "Error with vchan data populate.\n");
>  
>  	if (rte_dma_configure(dev_id, &dev_config) != 0)
> @@ -235,7 +243,7 @@ config_dmadevs(struct test_configure *cfg)
>  		}
>  
>  		ldm->dma_ids[i] = dev_id;
> -		configure_dmadev_queue(dev_id, cfg, ptrs_max);
> +		configure_dmadev_queue(dev_id, cfg, ptrs_max, nb_dmadevs);
>  		++nb_dmadevs;
>  	}
>  
> @@ -433,7 +441,8 @@ setup_memory_env(struct test_configure *cfg,
>  			 struct rte_dma_sge **src_sges, struct rte_dma_sge **dst_sges)
>  {
>  	static struct rte_mbuf_ext_shared_info *ext_buf_info;
> -	unsigned int buf_size = cfg->buf_size.cur;
> +	unsigned int cur_buf_size = cfg->buf_size.cur;
> +	unsigned int buf_size = cur_buf_size + RTE_PKTMBUF_HEADROOM;
>  	unsigned int nr_sockets;
>  	uint32_t nr_buf = cfg->nr_buf;
>  	uint32_t i;
> @@ -449,7 +458,7 @@ setup_memory_env(struct test_configure *cfg,
>  			nr_buf,
>  			0,
>  			0,
> -			buf_size + RTE_PKTMBUF_HEADROOM,
> +			buf_size,
>  			cfg->src_numa_node);
>  	if (src_pool == NULL) {
>  		PRINT_ERR("Error with source mempool creation.\n");
> @@ -460,7 +469,7 @@ setup_memory_env(struct test_configure *cfg,
>  			nr_buf,
>  			0,
>  			0,
> -			buf_size + RTE_PKTMBUF_HEADROOM,
> +			buf_size,
>  			cfg->dst_numa_node);
>  	if (dst_pool == NULL) {
>  		PRINT_ERR("Error with destination mempool creation.\n");
> @@ -490,8 +499,8 @@ setup_memory_env(struct test_configure *cfg,
>  	}
>  
>  	for (i = 0; i < nr_buf; i++) {
> -		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(), buf_size);
> -		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, buf_size);
> +		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(), cur_buf_size);
> +		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, cur_buf_size);
>  	}
>  
>  	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
> @@ -503,24 +512,38 @@ setup_memory_env(struct test_configure *cfg,
>  		}
>  	}
>  
> -	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
> +	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM && !cfg->is_bidir) {
> +		ext_buf_info->free_cb = dummy_free_ext_buf;
> +		ext_buf_info->fcb_opaque = NULL;
> +		for (i = 0; i < nr_buf; i++) {
> +			/* Using mbuf structure to hold remote iova address. */
> +			rte_pktmbuf_attach_extbuf((*srcs)[i], (void *)(cfg->raddr + (i * buf_size)),
> +						  (rte_iova_t)(cfg->raddr + (i * buf_size)), 0,
> +						  ext_buf_info);
> +			rte_mbuf_ext_refcnt_update(ext_buf_info, 1);
> +		}
> +	}
> +
> +	if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV && !cfg->is_bidir) {
>  		ext_buf_info->free_cb = dummy_free_ext_buf;
>  		ext_buf_info->fcb_opaque = NULL;
>  		for (i = 0; i < nr_buf; i++) {
>  			/* Using mbuf structure to hold remote iova address. */
> -			rte_pktmbuf_attach_extbuf((*srcs)[i], (void *)cfg->raddr,
> -						  (rte_iova_t)cfg->raddr, 0, ext_buf_info);
> +			rte_pktmbuf_attach_extbuf((*dsts)[i], (void *)(cfg->raddr + (i * buf_size)),
> +						  (rte_iova_t)(cfg->raddr + (i * buf_size)), 0,
> +						  ext_buf_info);
>  			rte_mbuf_ext_refcnt_update(ext_buf_info, 1);
>  		}
>  	}
>  
> -	if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
> +	if (cfg->is_bidir) {
>  		ext_buf_info->free_cb = dummy_free_ext_buf;
>  		ext_buf_info->fcb_opaque = NULL;
>  		for (i = 0; i < nr_buf; i++) {
>  			/* Using mbuf structure to hold remote iova address. */
> -			rte_pktmbuf_attach_extbuf((*dsts)[i], (void *)cfg->raddr,
> -						  (rte_iova_t)cfg->raddr, 0, ext_buf_info);
> +			rte_pktmbuf_attach_extbuf((*srcs)[i], (void *)(cfg->raddr + (i * buf_size)),
> +						  (rte_iova_t)(cfg->raddr + (i * buf_size)), 0,
> +						  ext_buf_info);
>  			rte_mbuf_ext_refcnt_update(ext_buf_info, 1);
>  		}
>  	}
> @@ -646,16 +669,30 @@ mem_copy_benchmark(struct test_configure *cfg)
>  		lcores[i]->nr_buf = (uint32_t)(nr_buf / nb_workers);
>  		lcores[i]->buf_size = buf_size;
>  		lcores[i]->test_secs = test_secs;
> -		lcores[i]->srcs = srcs + offset;
> -		lcores[i]->dsts = dsts + offset;
>  		lcores[i]->scenario_id = cfg->scenario_id;
>  		lcores[i]->lcore_id = lcore_id;
>  
> -		if (cfg->is_sg) {
> -			lcores[i]->src_ptrs = cfg->src_ptrs;
> -			lcores[i]->dst_ptrs = cfg->dst_ptrs;
> -			lcores[i]->src_sges = src_sges + (nr_sgsrc / nb_workers * i);
> -			lcores[i]->dst_sges = dst_sges + (nr_sgdst / nb_workers * i);
> +		/* Number of workers is equal to number of devices. In case of bi-directional
> +		 * dma, use 1 device for mem-to-dev and 1 device for dev-to-mem.
> +		 */
> +		if (cfg->is_dma && cfg->is_bidir && (i % 2 != 0)) {
> +			lcores[i]->dsts = srcs + offset;
> +			lcores[i]->srcs = dsts + offset;
> +			if (cfg->is_sg) {
> +				lcores[i]->dst_ptrs = cfg->src_ptrs;
> +				lcores[i]->src_ptrs = cfg->dst_ptrs;
> +				lcores[i]->dst_sges = src_sges + (nr_sgsrc / nb_workers * i);
> +				lcores[i]->src_sges = dst_sges + (nr_sgdst / nb_workers * i);
> +			}
> +		} else {
> +			lcores[i]->srcs = srcs + offset;
> +			lcores[i]->dsts = dsts + offset;
> +			if (cfg->is_sg) {
> +				lcores[i]->src_ptrs = cfg->src_ptrs;
> +				lcores[i]->dst_ptrs = cfg->dst_ptrs;
> +				lcores[i]->src_sges = src_sges + (nr_sgsrc / nb_workers * i);
> +				lcores[i]->dst_sges = dst_sges + (nr_sgdst / nb_workers * i);
> +			}
>  		}
>  
>  		if (cfg->is_dma) {
> @@ -699,7 +736,7 @@ mem_copy_benchmark(struct test_configure *cfg)
>  
>  	rte_eal_mp_wait_lcore();
>  
> -	if (!cfg->is_sg) {
> +	if (!cfg->is_sg && cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM) {
>  		for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
>  			if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
>  					rte_pktmbuf_mtod(dsts[i], void *),
> @@ -709,7 +746,7 @@ mem_copy_benchmark(struct test_configure *cfg)
>  				goto out;
>  			}
>  		}
> -	} else {
> +	} else if (cfg->is_sg && cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM) {
>  		size_t src_remsz = buf_size % cfg->src_ptrs;
>  		size_t dst_remsz = buf_size % cfg->dst_ptrs;
>  		size_t src_sz = buf_size / cfg->src_ptrs;
> @@ -756,6 +793,8 @@ mem_copy_benchmark(struct test_configure *cfg)
>  		calc_result(buf_size, nr_buf, nb_workers, test_secs,
>  			lcores[i]->worker_info.test_cpl,
>  			&memory, &avg_cycles, &bandwidth, &mops);
> +		if (cfg->is_bidir)
> +			printf("%s direction\n", i % 2 ? "MEM-to-DEV" : "DEV-to-MEM");
>  		output_result(cfg, lcores[i], kick_batch, avg_cycles, buf_size,
>  			nr_buf / nb_workers, memory, bandwidth, mops);
>  		mops_total += mops;
> @@ -769,7 +808,7 @@ mem_copy_benchmark(struct test_configure *cfg)
>  
>  out:
>  
> -	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM)
> +	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM || cfg->is_bidir)
>  		m = srcs;
>  	else if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV)
>  		m = dsts;
> diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
> index f460b93414..a764ef7e7f 100644
> --- a/app/test-dma-perf/config.ini
> +++ b/app/test-dma-perf/config.ini
> @@ -55,6 +55,10 @@
>  ; "pfid" denotes PF-id to be used for data transfer
>  ; "vfid" denotes VF-id of PF-id to be used for data transfer.
>  
> +; "xfer_mode" denotes mode of data transfer. It can take 2 values:
> +;    0 - Unidirectional transfer based on direction configured (default).
> +;    1 - Bi-directional transfer based on direction configured (mem-to-dev and dev-to-mem).
> +
>  ; =========== End of "mem to dev" and "dev to mem" config parameters. ==============
>  
>  [case1]
> @@ -89,6 +93,7 @@ eal_args=--in-memory --file-prefix=test
>  skip=1
>  type=DMA_MEM_COPY
>  direction=2
> +xfer_mode=0
>  raddr=0x200000000
>  scoreid=0
>  dcoreid=0
> diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
> index e81eca14e1..be91405305 100644
> --- a/app/test-dma-perf/main.c
> +++ b/app/test-dma-perf/main.c
> @@ -342,6 +342,7 @@ load_configs(const char *path)
>  		*src_ptrs_str, *dst_ptrs_str;
>  	const char *skip;
>  	const char *raddr, *scoreid, *dcoreid, *vfid, *pfid;
> +	const char *xfer_mode;
>  	int args_nr, nb_vp;
>  	bool is_dma;
>  
> @@ -393,6 +394,20 @@ load_configs(const char *path)
>  				test_case->is_valid = false;
>  				continue;
>  			}
> +			xfer_mode = rte_cfgfile_get_entry(cfgfile, section_name, "xfer_mode");
> +			if (xfer_mode) {
> +				int xmode = atoi(xfer_mode);
> +				if (xmode == 1) {
> +					if (test_case->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM) {
> +						printf("Error: Invalid configuration. For mem to"
> +						       " mem dma transfer bi-directional cannot be"
> +						       " configured.\n");
> +						test_case->is_valid = false;
> +						continue;
> +					}
> +					test_case->is_bidir = true;
> +				}
> +			}
>  			is_dma = true;
>  		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
>  			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
> @@ -405,7 +420,7 @@ load_configs(const char *path)
>  		}
>  
>  		if (test_case->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV ||
> -			test_case->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
> +		    test_case->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
>  			char *endptr;
>  
>  			raddr = rte_cfgfile_get_entry(cfgfile, section_name, "raddr");
> @@ -434,7 +449,7 @@ load_configs(const char *path)
>  
>  		}
>  
> -		if (test_case->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
> +		if (test_case->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM || test_case->is_bidir) {
>  			scoreid = rte_cfgfile_get_entry(cfgfile, section_name, "scoreid");
>  			if (scoreid == NULL) {
>  				printf("Error: No scoreid configured for case%d.\n", i + 1);
> @@ -444,7 +459,7 @@ load_configs(const char *path)
>  			test_case->scoreid = (uint8_t)atoi(scoreid);
>  		}
>  
> -		if (test_case->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
> +		if (test_case->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV || test_case->is_bidir) {
>  			dcoreid = rte_cfgfile_get_entry(cfgfile, section_name, "dcoreid");
>  			if (dcoreid == NULL) {
>  				printf("Error: No dcoreid configured for case%d.\n", i + 1);
> diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
> index 31e0bf71c9..478c2a1c6d 100644
> --- a/app/test-dma-perf/main.h
> +++ b/app/test-dma-perf/main.h
> @@ -66,6 +66,7 @@ struct test_configure {
>  	uint8_t pfid;
>  	uint16_t vfid;
>  	uintptr_t raddr;
> +	bool is_bidir;
>  };
>  
>  int mem_copy_benchmark(struct test_configure *cfg);
>
  
Amit Prakash Shukla Feb. 27, 2024, 9:53 a.m. UTC | #2
Hi Chengwen,

Please see my reply in-line.

Thanks

> -----Original Message-----
> From: fengchengwen <fengchengwen@huawei.com>
> Sent: Wednesday, February 21, 2024 11:54 AM
> To: Amit Prakash Shukla <amitprakashs@marvell.com>; Cheng Jiang
> <honest.jiang@foxmail.com>
> Cc: dev@dpdk.org; Jerin Jacob <jerinj@marvell.com>; Vamsi Krishna Attunuru
> <vattunuru@marvell.com>; Nithin Kumar Dabilpuram
> <ndabilpuram@marvell.com>; Anoob Joseph <anoobj@marvell.com>;
> Gowrishankar Muthukrishnan <gmuthukrishn@marvell.com>
> Subject: [EXT] Re: [PATCH] app/dma-perf: support bi-directional transfer
> 
> External Email
> 
> ----------------------------------------------------------------------
> Hi Amit,
> 
> I didn't get the bi-directional meaning. Does it mean: one core + one dmadev
> works for mem2dev while another core + another dmadev works for dev2mem?
> 
> Thanks

Yes, 1 core + 1 dma dev for mem2dev transfer and another core + another dma dev for dev2mem transfer.

> 
> On 2024/1/8 16:27, Amit Prakash Shukla wrote:
> > Adds bi-directional DMA transfer support to test performance.
> >
> > Signed-off-by: Amit Prakash Shukla <amitprakashs@marvell.com>
> > ---
> > Depends-on: series-30357 ("PCI Dev and SG copy support")

<snip>
  

Patch

diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index 4530bd98ce..91ba0f4718 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -144,12 +144,19 @@  cache_flush_buf(__rte_unused struct rte_mbuf **array,
 
 static int
 vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
-		    struct test_configure *cfg)
+		    struct test_configure *cfg, uint16_t dev_num)
 {
 	struct rte_dma_info info;
 
 	qconf->direction = cfg->transfer_dir;
 
+	/* If it's a bi-directional test, configure odd device for outbound dma
+	 * transfer and even device for inbound dma transfer.
+	 */
+	if (cfg->is_bidir)
+		qconf->direction = (dev_num % 2) ? RTE_DMA_DIR_MEM_TO_DEV :
+				   RTE_DMA_DIR_DEV_TO_MEM;
+
 	rte_dma_info_get(dev_id, &info);
 	if (!(RTE_BIT64(qconf->direction) & info.dev_capa))
 		return -1;
@@ -181,14 +188,15 @@  vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
 
 /* Configuration of device. */
 static void
-configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg, uint8_t ptrs_max)
+configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg, uint8_t ptrs_max,
+		       uint16_t dev_num)
 {
 	uint16_t vchan = 0;
 	struct rte_dma_info info;
 	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
 	struct rte_dma_vchan_conf qconf = { 0 };
 
-	if (vchan_data_populate(dev_id, &qconf, cfg) != 0)
+	if (vchan_data_populate(dev_id, &qconf, cfg, dev_num) != 0)
 		rte_exit(EXIT_FAILURE, "Error with vchan data populate.\n");
 
 	if (rte_dma_configure(dev_id, &dev_config) != 0)
@@ -235,7 +243,7 @@  config_dmadevs(struct test_configure *cfg)
 		}
 
 		ldm->dma_ids[i] = dev_id;
-		configure_dmadev_queue(dev_id, cfg, ptrs_max);
+		configure_dmadev_queue(dev_id, cfg, ptrs_max, nb_dmadevs);
 		++nb_dmadevs;
 	}
 
@@ -433,7 +441,8 @@  setup_memory_env(struct test_configure *cfg,
 			 struct rte_dma_sge **src_sges, struct rte_dma_sge **dst_sges)
 {
 	static struct rte_mbuf_ext_shared_info *ext_buf_info;
-	unsigned int buf_size = cfg->buf_size.cur;
+	unsigned int cur_buf_size = cfg->buf_size.cur;
+	unsigned int buf_size = cur_buf_size + RTE_PKTMBUF_HEADROOM;
 	unsigned int nr_sockets;
 	uint32_t nr_buf = cfg->nr_buf;
 	uint32_t i;
@@ -449,7 +458,7 @@  setup_memory_env(struct test_configure *cfg,
 			nr_buf,
 			0,
 			0,
-			buf_size + RTE_PKTMBUF_HEADROOM,
+			buf_size,
 			cfg->src_numa_node);
 	if (src_pool == NULL) {
 		PRINT_ERR("Error with source mempool creation.\n");
@@ -460,7 +469,7 @@  setup_memory_env(struct test_configure *cfg,
 			nr_buf,
 			0,
 			0,
-			buf_size + RTE_PKTMBUF_HEADROOM,
+			buf_size,
 			cfg->dst_numa_node);
 	if (dst_pool == NULL) {
 		PRINT_ERR("Error with destination mempool creation.\n");
@@ -490,8 +499,8 @@  setup_memory_env(struct test_configure *cfg,
 	}
 
 	for (i = 0; i < nr_buf; i++) {
-		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(), buf_size);
-		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, buf_size);
+		memset(rte_pktmbuf_mtod((*srcs)[i], void *), rte_rand(), cur_buf_size);
+		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, cur_buf_size);
 	}
 
 	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
@@ -503,24 +512,38 @@  setup_memory_env(struct test_configure *cfg,
 		}
 	}
 
-	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
+	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM && !cfg->is_bidir) {
+		ext_buf_info->free_cb = dummy_free_ext_buf;
+		ext_buf_info->fcb_opaque = NULL;
+		for (i = 0; i < nr_buf; i++) {
+			/* Using mbuf structure to hold remote iova address. */
+			rte_pktmbuf_attach_extbuf((*srcs)[i], (void *)(cfg->raddr + (i * buf_size)),
+						  (rte_iova_t)(cfg->raddr + (i * buf_size)), 0,
+						  ext_buf_info);
+			rte_mbuf_ext_refcnt_update(ext_buf_info, 1);
+		}
+	}
+
+	if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV && !cfg->is_bidir) {
 		ext_buf_info->free_cb = dummy_free_ext_buf;
 		ext_buf_info->fcb_opaque = NULL;
 		for (i = 0; i < nr_buf; i++) {
 			/* Using mbuf structure to hold remote iova address. */
-			rte_pktmbuf_attach_extbuf((*srcs)[i], (void *)cfg->raddr,
-						  (rte_iova_t)cfg->raddr, 0, ext_buf_info);
+			rte_pktmbuf_attach_extbuf((*dsts)[i], (void *)(cfg->raddr + (i * buf_size)),
+						  (rte_iova_t)(cfg->raddr + (i * buf_size)), 0,
+						  ext_buf_info);
 			rte_mbuf_ext_refcnt_update(ext_buf_info, 1);
 		}
 	}
 
-	if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
+	if (cfg->is_bidir) {
 		ext_buf_info->free_cb = dummy_free_ext_buf;
 		ext_buf_info->fcb_opaque = NULL;
 		for (i = 0; i < nr_buf; i++) {
 			/* Using mbuf structure to hold remote iova address. */
-			rte_pktmbuf_attach_extbuf((*dsts)[i], (void *)cfg->raddr,
-						  (rte_iova_t)cfg->raddr, 0, ext_buf_info);
+			rte_pktmbuf_attach_extbuf((*srcs)[i], (void *)(cfg->raddr + (i * buf_size)),
+						  (rte_iova_t)(cfg->raddr + (i * buf_size)), 0,
+						  ext_buf_info);
 			rte_mbuf_ext_refcnt_update(ext_buf_info, 1);
 		}
 	}
@@ -646,16 +669,30 @@  mem_copy_benchmark(struct test_configure *cfg)
 		lcores[i]->nr_buf = (uint32_t)(nr_buf / nb_workers);
 		lcores[i]->buf_size = buf_size;
 		lcores[i]->test_secs = test_secs;
-		lcores[i]->srcs = srcs + offset;
-		lcores[i]->dsts = dsts + offset;
 		lcores[i]->scenario_id = cfg->scenario_id;
 		lcores[i]->lcore_id = lcore_id;
 
-		if (cfg->is_sg) {
-			lcores[i]->src_ptrs = cfg->src_ptrs;
-			lcores[i]->dst_ptrs = cfg->dst_ptrs;
-			lcores[i]->src_sges = src_sges + (nr_sgsrc / nb_workers * i);
-			lcores[i]->dst_sges = dst_sges + (nr_sgdst / nb_workers * i);
+		/* Number of workers is equal to number of devices. In case of bi-directional
+		 * dma, use 1 device for mem-to-dev and 1 device for dev-to-mem.
+		 */
+		if (cfg->is_dma && cfg->is_bidir && (i % 2 != 0)) {
+			lcores[i]->dsts = srcs + offset;
+			lcores[i]->srcs = dsts + offset;
+			if (cfg->is_sg) {
+				lcores[i]->dst_ptrs = cfg->src_ptrs;
+				lcores[i]->src_ptrs = cfg->dst_ptrs;
+				lcores[i]->dst_sges = src_sges + (nr_sgsrc / nb_workers * i);
+				lcores[i]->src_sges = dst_sges + (nr_sgdst / nb_workers * i);
+			}
+		} else {
+			lcores[i]->srcs = srcs + offset;
+			lcores[i]->dsts = dsts + offset;
+			if (cfg->is_sg) {
+				lcores[i]->src_ptrs = cfg->src_ptrs;
+				lcores[i]->dst_ptrs = cfg->dst_ptrs;
+				lcores[i]->src_sges = src_sges + (nr_sgsrc / nb_workers * i);
+				lcores[i]->dst_sges = dst_sges + (nr_sgdst / nb_workers * i);
+			}
 		}
 
 		if (cfg->is_dma) {
@@ -699,7 +736,7 @@  mem_copy_benchmark(struct test_configure *cfg)
 
 	rte_eal_mp_wait_lcore();
 
-	if (!cfg->is_sg) {
+	if (!cfg->is_sg && cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM) {
 		for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
 			if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
 					rte_pktmbuf_mtod(dsts[i], void *),
@@ -709,7 +746,7 @@  mem_copy_benchmark(struct test_configure *cfg)
 				goto out;
 			}
 		}
-	} else {
+	} else if (cfg->is_sg && cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM) {
 		size_t src_remsz = buf_size % cfg->src_ptrs;
 		size_t dst_remsz = buf_size % cfg->dst_ptrs;
 		size_t src_sz = buf_size / cfg->src_ptrs;
@@ -756,6 +793,8 @@  mem_copy_benchmark(struct test_configure *cfg)
 		calc_result(buf_size, nr_buf, nb_workers, test_secs,
 			lcores[i]->worker_info.test_cpl,
 			&memory, &avg_cycles, &bandwidth, &mops);
+		if (cfg->is_bidir)
+			printf("%s direction\n", i % 2 ? "MEM-to-DEV" : "DEV-to-MEM");
 		output_result(cfg, lcores[i], kick_batch, avg_cycles, buf_size,
 			nr_buf / nb_workers, memory, bandwidth, mops);
 		mops_total += mops;
@@ -769,7 +808,7 @@  mem_copy_benchmark(struct test_configure *cfg)
 
 out:
 
-	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM)
+	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM || cfg->is_bidir)
 		m = srcs;
 	else if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV)
 		m = dsts;
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index f460b93414..a764ef7e7f 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -55,6 +55,10 @@ 
 ; "pfid" denotes PF-id to be used for data transfer
 ; "vfid" denotes VF-id of PF-id to be used for data transfer.
 
+; "xfer_mode" denotes mode of data transfer. It can take 2 values:
+;    0 - Unidirectional transfer based on direction configured (default).
+;    1 - Bi-directional transfer based on direction configured (mem-to-dev and dev-to-mem).
+
 ; =========== End of "mem to dev" and "dev to mem" config parameters. ==============
 
 [case1]
@@ -89,6 +93,7 @@  eal_args=--in-memory --file-prefix=test
 skip=1
 type=DMA_MEM_COPY
 direction=2
+xfer_mode=0
 raddr=0x200000000
 scoreid=0
 dcoreid=0
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index e81eca14e1..be91405305 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -342,6 +342,7 @@  load_configs(const char *path)
 		*src_ptrs_str, *dst_ptrs_str;
 	const char *skip;
 	const char *raddr, *scoreid, *dcoreid, *vfid, *pfid;
+	const char *xfer_mode;
 	int args_nr, nb_vp;
 	bool is_dma;
 
@@ -393,6 +394,20 @@  load_configs(const char *path)
 				test_case->is_valid = false;
 				continue;
 			}
+			xfer_mode = rte_cfgfile_get_entry(cfgfile, section_name, "xfer_mode");
+			if (xfer_mode) {
+				int xmode = atoi(xfer_mode);
+				if (xmode == 1) {
+					if (test_case->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM) {
+						printf("Error: Invalid configuration. For mem to"
+						       " mem dma transfer bi-directional cannot be"
+						       " configured.\n");
+						test_case->is_valid = false;
+						continue;
+					}
+					test_case->is_bidir = true;
+				}
+			}
 			is_dma = true;
 		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
 			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
@@ -405,7 +420,7 @@  load_configs(const char *path)
 		}
 
 		if (test_case->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV ||
-			test_case->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
+		    test_case->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
 			char *endptr;
 
 			raddr = rte_cfgfile_get_entry(cfgfile, section_name, "raddr");
@@ -434,7 +449,7 @@  load_configs(const char *path)
 
 		}
 
-		if (test_case->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
+		if (test_case->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM || test_case->is_bidir) {
 			scoreid = rte_cfgfile_get_entry(cfgfile, section_name, "scoreid");
 			if (scoreid == NULL) {
 				printf("Error: No scoreid configured for case%d.\n", i + 1);
@@ -444,7 +459,7 @@  load_configs(const char *path)
 			test_case->scoreid = (uint8_t)atoi(scoreid);
 		}
 
-		if (test_case->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
+		if (test_case->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV || test_case->is_bidir) {
 			dcoreid = rte_cfgfile_get_entry(cfgfile, section_name, "dcoreid");
 			if (dcoreid == NULL) {
 				printf("Error: No dcoreid configured for case%d.\n", i + 1);
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index 31e0bf71c9..478c2a1c6d 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -66,6 +66,7 @@  struct test_configure {
 	uint8_t pfid;
 	uint16_t vfid;
 	uintptr_t raddr;
+	bool is_bidir;
 };
 
 int mem_copy_benchmark(struct test_configure *cfg);