[v3,2/2] dma/cnxk: rewrite DMA fastpath

Message ID 20230830165441.9188-2-pbhagavatula@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: Jerin Jacob
Series [v3,1/2] dma/cnxk: use mempool for DMA chunk pool

Checks

Context Check Description
ci/loongarch-compilation success Compilation OK
ci/loongarch-unit-testing success Unit Testing PASS
ci/checkpatch success coding style OK
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-compile-amd64-testing success Testing PASS
ci/iol-sample-apps-testing success Testing PASS
ci/iol-unit-amd64-testing success Testing PASS
ci/iol-unit-arm64-testing success Testing PASS
ci/iol-compile-arm64-testing fail Testing issues
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-broadcom-Functional success Functional Testing PASS
ci/Intel-compilation success Compilation OK
ci/intel-Testing success Testing PASS
ci/intel-Functional success Functional PASS

Commit Message

Pavan Nikhilesh Bhagavatula Aug. 30, 2023, 4:54 p.m. UTC
  From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Rewrite the DMA fastpath to use NEON instructions and reduce the
number of words read from the config.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/dma/cnxk/cnxk_dmadev.c    | 454 +++--------------------------
 drivers/dma/cnxk/cnxk_dmadev.h    |  89 +++++-
 drivers/dma/cnxk/cnxk_dmadev_fp.c | 455 ++++++++++++++++++++++++++++++
 drivers/dma/cnxk/meson.build      |   9 +-
 4 files changed, 577 insertions(+), 430 deletions(-)
 create mode 100644 drivers/dma/cnxk/cnxk_dmadev_fp.c
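
A minimal caller-side sketch of how the reworked fastpath is driven
through the generic dmadev API (the dev_id/vchan values and buffer
arrays below are hypothetical): enqueues with flags == 0 only queue
the command and grow the pending word count, and a single
rte_dma_submit() lets the PMD ring the DPI_VDMA_DBELL doorbell once
for the whole batch.

#include <rte_dmadev.h>

static void
enqueue_burst(int16_t dev_id, uint16_t vchan, const rte_iova_t *src,
	      const rte_iova_t *dst, uint32_t len, uint16_t burst)
{
	uint16_t i;

	/* flags == 0: queue only, no doorbell; the PMD accumulates
	 * pending words until submit.
	 */
	for (i = 0; i < burst; i++)
		if (rte_dma_copy(dev_id, vchan, src[i], dst[i], len, 0) < 0)
			break; /* e.g. -ENOSPC when the completion ring is full */

	/* One doorbell for everything queued above. */
	rte_dma_submit(dev_id, vchan);
}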
  

Patch

diff --git a/drivers/dma/cnxk/cnxk_dmadev.c b/drivers/dma/cnxk/cnxk_dmadev.c
index 35c2b79156..465290ce7a 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.c
+++ b/drivers/dma/cnxk/cnxk_dmadev.c
@@ -2,19 +2,6 @@ 
  * Copyright (C) 2021 Marvell International Ltd.
  */
 
-#include <string.h>
-#include <unistd.h>
-
-#include <bus_pci_driver.h>
-#include <rte_common.h>
-#include <rte_dmadev.h>
-#include <rte_dmadev_pmd.h>
-#include <rte_eal.h>
-#include <rte_lcore.h>
-#include <rte_mbuf_pool_ops.h>
-#include <rte_mempool.h>
-#include <rte_pci.h>
-
 #include <cnxk_dmadev.h>
 
 static int cnxk_stats_reset(struct rte_dma_dev *dev, uint16_t vchan);
@@ -166,22 +153,9 @@  cnxk_dmadev_configure(struct rte_dma_dev *dev, const struct rte_dma_conf *conf,
 	return rc;
 }
 
-static int
-cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
-			const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+static void
+cn9k_dmadev_setup_hdr(union cnxk_dpi_instr_cmd *header, const struct rte_dma_vchan_conf *conf)
 {
-	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	uint16_t max_desc;
-	uint32_t size;
-	int i;
-
-	RTE_SET_USED(conf_sz);
-
-	if (dpivf->flag & CNXK_DPI_DEV_START)
-		return 0;
-
 	header->cn9k.pt = DPI_HDR_PT_ZBW_CA;
 
 	switch (conf->direction) {
@@ -217,57 +191,11 @@  cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 		header->cn9k.fport = conf->dst_port.pcie.coreid;
 		header->cn9k.pvfe = 0;
 	};
-
-	/* Free up descriptor memory before allocating. */
-	cnxk_dmadev_vchan_free(dpivf, vchan);
-
-	max_desc = conf->nb_desc;
-	if (!rte_is_power_of_2(max_desc))
-		max_desc = rte_align32pow2(max_desc);
-
-	if (max_desc > DPI_MAX_DESC)
-		max_desc = DPI_MAX_DESC;
-
-	size = (max_desc * sizeof(struct cnxk_dpi_compl_s *));
-	dpi_conf->c_desc.compl_ptr = rte_zmalloc(NULL, size, 0);
-
-	if (dpi_conf->c_desc.compl_ptr == NULL) {
-		plt_err("Failed to allocate for comp_data");
-		return -ENOMEM;
-	}
-
-	for (i = 0; i < max_desc; i++) {
-		dpi_conf->c_desc.compl_ptr[i] =
-			rte_zmalloc(NULL, sizeof(struct cnxk_dpi_compl_s), 0);
-		if (!dpi_conf->c_desc.compl_ptr[i]) {
-			plt_err("Failed to allocate for descriptor memory");
-			return -ENOMEM;
-		}
-
-		dpi_conf->c_desc.compl_ptr[i]->cdata = DPI_REQ_CDATA;
-	}
-
-	dpi_conf->c_desc.max_cnt = (max_desc - 1);
-
-	return 0;
 }
 
-static int
-cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
-			 const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+static void
+cn10k_dmadev_setup_hdr(union cnxk_dpi_instr_cmd *header, const struct rte_dma_vchan_conf *conf)
 {
-	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	uint16_t max_desc;
-	uint32_t size;
-	int i;
-
-	RTE_SET_USED(conf_sz);
-
-	if (dpivf->flag & CNXK_DPI_DEV_START)
-		return 0;
-
 	header->cn10k.pt = DPI_HDR_PT_ZBW_CA;
 
 	switch (conf->direction) {
@@ -303,6 +231,29 @@  cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 		header->cn10k.fport = conf->dst_port.pcie.coreid;
 		header->cn10k.pvfe = 0;
 	};
+}
+
+static int
+cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
+			const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	union cnxk_dpi_instr_cmd *header;
+	uint16_t max_desc;
+	uint32_t size;
+	int i;
+
+	RTE_SET_USED(conf_sz);
+
+	header = (union cnxk_dpi_instr_cmd *)&dpi_conf->cmd.u;
+	if (dpivf->flag & CNXK_DPI_DEV_START)
+		return 0;
+
+	if (dpivf->is_cn10k)
+		cn10k_dmadev_setup_hdr(header, conf);
+	else
+		cn9k_dmadev_setup_hdr(header, conf);
 
 	/* Free up descriptor memory before allocating. */
 	cnxk_dmadev_vchan_free(dpivf, vchan);
@@ -329,6 +280,7 @@  cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
 			plt_err("Failed to allocate for descriptor memory");
 			return -ENOMEM;
 		}
+
 		dpi_conf->c_desc.compl_ptr[i]->cdata = DPI_REQ_CDATA;
 	}
 
@@ -374,6 +326,11 @@  static int
 cnxk_dmadev_stop(struct rte_dma_dev *dev)
 {
 	struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
+	uint64_t reg;
+
+	reg = plt_read64(dpivf->rdpi.rbase + DPI_VDMA_SADDR);
+	while (!(reg & BIT_ULL(63)))
+		reg = plt_read64(dpivf->rdpi.rbase + DPI_VDMA_SADDR);
 
 	roc_dpi_disable(&dpivf->rdpi);
 	dpivf->flag &= ~CNXK_DPI_DEV_START;
@@ -396,332 +353,6 @@  cnxk_dmadev_close(struct rte_dma_dev *dev)
 	return 0;
 }
 
-static inline int
-__dpi_queue_write(struct cnxk_dpi_vf_s *dpi, uint64_t *cmds, int cmd_count)
-{
-	uint64_t *ptr = dpi->chunk_base;
-
-	if ((cmd_count < DPI_MIN_CMD_SIZE) || (cmd_count > DPI_MAX_CMD_SIZE) || cmds == NULL)
-		return -EINVAL;
-
-	/*
-	 * Normally there is plenty of room in the current buffer for the
-	 * command
-	 */
-	if (dpi->chunk_head + cmd_count < dpi->chunk_size_m1) {
-		ptr += dpi->chunk_head;
-		dpi->chunk_head += cmd_count;
-		while (cmd_count--)
-			*ptr++ = *cmds++;
-	} else {
-		uint64_t *new_buff = NULL;
-		int count;
-
-		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
-			plt_dpi_dbg("Failed to alloc next buffer from NPA");
-			return -ENOMEM;
-		}
-
-		/*
-		 * Figure out how many cmd words will fit in this buffer.
-		 * One location will be needed for the next buffer pointer.
-		 */
-		count = dpi->chunk_size_m1 - dpi->chunk_head;
-		ptr += dpi->chunk_head;
-		cmd_count -= count;
-		while (count--)
-			*ptr++ = *cmds++;
-
-		/*
-		 * chunk next ptr is 2 DWORDS
-		 * second DWORD is reserved.
-		 */
-		*ptr++ = (uint64_t)new_buff;
-		*ptr = 0;
-
-		/*
-		 * The current buffer is full and has a link to the next
-		 * buffers. Time to write the rest of the commands into the new
-		 * buffer.
-		 */
-		dpi->chunk_base = new_buff;
-		dpi->chunk_head = cmd_count;
-		ptr = new_buff;
-		while (cmd_count--)
-			*ptr++ = *cmds++;
-
-		/* queue index may be greater than pool size */
-		if (dpi->chunk_head == dpi->chunk_size_m1) {
-			if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
-				plt_dpi_dbg("Failed to alloc next buffer from NPA");
-				return -ENOMEM;
-			}
-			/* Write next buffer address */
-			*ptr = (uint64_t)new_buff;
-			dpi->chunk_base = new_buff;
-			dpi->chunk_head = 0;
-		}
-	}
-
-	return 0;
-}
-
-static int
-cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst, uint32_t length,
-		 uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	rte_iova_t fptr, lptr;
-	int num_words = 0;
-	int rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn9k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn9k.nfst = 1;
-	header->cn9k.nlst = 1;
-
-	/*
-	 * For inbound case, src pointers are last pointers.
-	 * For all other cases, src pointers are first pointers.
-	 */
-	if (header->cn9k.xtype == DPI_XTYPE_INBOUND) {
-		fptr = dst;
-		lptr = src;
-	} else {
-		fptr = src;
-		lptr = dst;
-	}
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	/* word3 is always 0 */
-	num_words += 4;
-	cmd[num_words++] = length;
-	cmd[num_words++] = fptr;
-	cmd[num_words++] = length;
-	cmd[num_words++] = lptr;
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
-		    const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst, uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	const struct rte_dma_sge *fptr, *lptr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	int num_words = 0;
-	int i, rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn9k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	/*
-	 * For inbound case, src pointers are last pointers.
-	 * For all other cases, src pointers are first pointers.
-	 */
-	if (header->cn9k.xtype == DPI_XTYPE_INBOUND) {
-		header->cn9k.nfst = nb_dst & DPI_MAX_POINTER;
-		header->cn9k.nlst = nb_src & DPI_MAX_POINTER;
-		fptr = &dst[0];
-		lptr = &src[0];
-	} else {
-		header->cn9k.nfst = nb_src & DPI_MAX_POINTER;
-		header->cn9k.nlst = nb_dst & DPI_MAX_POINTER;
-		fptr = &src[0];
-		lptr = &dst[0];
-	}
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	num_words += 4;
-	for (i = 0; i < header->cn9k.nfst; i++) {
-		cmd[num_words++] = (uint64_t)fptr->length;
-		cmd[num_words++] = fptr->addr;
-		fptr++;
-	}
-
-	for (i = 0; i < header->cn9k.nlst; i++) {
-		cmd[num_words++] = (uint64_t)lptr->length;
-		cmd[num_words++] = lptr->addr;
-		lptr++;
-	}
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
-		  uint32_t length, uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	rte_iova_t fptr, lptr;
-	int num_words = 0;
-	int rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn10k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn10k.nfst = 1;
-	header->cn10k.nlst = 1;
-
-	fptr = src;
-	lptr = dst;
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	/* word3 is always 0 */
-	num_words += 4;
-	cmd[num_words++] = length;
-	cmd[num_words++] = fptr;
-	cmd[num_words++] = length;
-	cmd[num_words++] = lptr;
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
-static int
-cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
-		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
-		     uint64_t flags)
-{
-	struct cnxk_dpi_vf_s *dpivf = dev_private;
-	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-	union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-	const struct rte_dma_sge *fptr, *lptr;
-	struct cnxk_dpi_compl_s *comp_ptr;
-	uint64_t cmd[DPI_MAX_CMD_SIZE];
-	int num_words = 0;
-	int i, rc;
-
-	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-	header->cn10k.ptr = (uint64_t)comp_ptr;
-	STRM_INC(dpi_conf->c_desc, tail);
-
-	if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return -ENOSPC;
-	}
-
-	header->cn10k.nfst = nb_src & DPI_MAX_POINTER;
-	header->cn10k.nlst = nb_dst & DPI_MAX_POINTER;
-	fptr = &src[0];
-	lptr = &dst[0];
-
-	cmd[0] = header->u[0];
-	cmd[1] = header->u[1];
-	cmd[2] = header->u[2];
-	num_words += 4;
-
-	for (i = 0; i < header->cn10k.nfst; i++) {
-		cmd[num_words++] = (uint64_t)fptr->length;
-		cmd[num_words++] = fptr->addr;
-		fptr++;
-	}
-
-	for (i = 0; i < header->cn10k.nlst; i++) {
-		cmd[num_words++] = (uint64_t)lptr->length;
-		cmd[num_words++] = lptr->addr;
-		lptr++;
-	}
-
-	rc = __dpi_queue_write(dpivf, cmd, num_words);
-	if (unlikely(rc)) {
-		STRM_DEC(dpi_conf->c_desc, tail);
-		return rc;
-	}
-
-	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-		rte_wmb();
-		plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-		dpi_conf->stats.submitted++;
-	} else {
-		dpi_conf->pnum_words += num_words;
-		dpi_conf->pending++;
-	}
-
-	return dpi_conf->desc_idx++;
-}
-
 static uint16_t
 cnxk_dmadev_completed(void *dev_private, uint16_t vchan, const uint16_t nb_cpls, uint16_t *last_idx,
 		      bool *has_error)
@@ -880,17 +511,6 @@  cnxk_stats_reset(struct rte_dma_dev *dev, uint16_t vchan)
 	return 0;
 }
 
-static const struct rte_dma_dev_ops cn10k_dmadev_ops = {
-	.dev_close = cnxk_dmadev_close,
-	.dev_configure = cnxk_dmadev_configure,
-	.dev_info_get = cnxk_dmadev_info_get,
-	.dev_start = cnxk_dmadev_start,
-	.dev_stop = cnxk_dmadev_stop,
-	.stats_get = cnxk_stats_get,
-	.stats_reset = cnxk_stats_reset,
-	.vchan_setup = cn10k_dmadev_vchan_setup,
-};
-
 static const struct rte_dma_dev_ops cnxk_dmadev_ops = {
 	.dev_close = cnxk_dmadev_close,
 	.dev_configure = cnxk_dmadev_configure,
@@ -941,12 +561,8 @@  cnxk_dmadev_probe(struct rte_pci_driver *pci_drv __rte_unused, struct rte_pci_de
 	dmadev->fp_obj->completed_status = cnxk_dmadev_completed_status;
 	dmadev->fp_obj->burst_capacity = cnxk_damdev_burst_capacity;
 
-	if (pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KA ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KAS ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CNF10KA ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CNF10KB ||
-	    pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KB) {
-		dmadev->dev_ops = &cn10k_dmadev_ops;
+	if (roc_model_is_cn10k()) {
+		dpivf->is_cn10k = true;
 		dmadev->fp_obj->copy = cn10k_dmadev_copy;
 		dmadev->fp_obj->copy_sg = cn10k_dmadev_copy_sg;
 	}
diff --git a/drivers/dma/cnxk/cnxk_dmadev.h b/drivers/dma/cnxk/cnxk_dmadev.h
index 65f12d844d..c9032de779 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.h
+++ b/drivers/dma/cnxk/cnxk_dmadev.h
@@ -4,14 +4,27 @@ 
 #ifndef CNXK_DMADEV_H
 #define CNXK_DMADEV_H
 
+#include <string.h>
+#include <unistd.h>
+
+#include <bus_pci_driver.h>
+#include <rte_common.h>
+#include <rte_dmadev.h>
+#include <rte_dmadev_pmd.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_mbuf_pool_ops.h>
+#include <rte_mempool.h>
+#include <rte_pci.h>
+
 #include <roc_api.h>
 
-#define DPI_MAX_POINTER	     15
-#define STRM_INC(s, var)     ((s).var = ((s).var + 1) & (s).max_cnt)
-#define STRM_DEC(s, var)     ((s).var = ((s).var - 1) == -1 ? (s).max_cnt : ((s).var - 1))
-#define DPI_MAX_DESC	     2048
-#define DPI_MIN_DESC	     2
-#define MAX_VCHANS_PER_QUEUE 4
+#define DPI_MAX_POINTER	       15
+#define STRM_INC(s, var)       ((s).var = ((s).var + 1) & (s).max_cnt)
+#define STRM_DEC(s, var)       ((s).var = ((s).var - 1) == -1 ? (s).max_cnt : ((s).var - 1))
+#define DPI_MAX_DESC	       2048
+#define DPI_MIN_DESC	       2
+#define MAX_VCHANS_PER_QUEUE   4
 #define DPI_CMD_QUEUE_BUF_SIZE 4096
 #define DPI_CMD_QUEUE_BUFS     1024
 
@@ -21,8 +34,51 @@ 
 #define DPI_REQ_CDATA 0xFF
 
 #define CNXK_DMA_POOL_MAX_CACHE_SZ (16)
-#define CNXK_DPI_DEV_CONFIG (1ULL << 0)
-#define CNXK_DPI_DEV_START  (1ULL << 1)
+#define CNXK_DPI_DEV_CONFIG	   (1ULL << 0)
+#define CNXK_DPI_DEV_START	   (1ULL << 1)
+
+union cnxk_dpi_instr_cmd {
+	uint64_t u;
+	struct cn9k_dpi_instr_cmd {
+		uint64_t aura : 20;
+		uint64_t func : 16;
+		uint64_t pt : 2;
+		uint64_t reserved_102 : 1;
+		uint64_t pvfe : 1;
+		uint64_t fl : 1;
+		uint64_t ii : 1;
+		uint64_t fi : 1;
+		uint64_t ca : 1;
+		uint64_t csel : 1;
+		uint64_t reserved_109_111 : 3;
+		uint64_t xtype : 2;
+		uint64_t reserved_114_119 : 6;
+		uint64_t fport : 2;
+		uint64_t reserved_122_123 : 2;
+		uint64_t lport : 2;
+		uint64_t reserved_126_127 : 2;
+		/* Word 1 - End */
+	} cn9k;
+
+	struct cn10k_dpi_instr_cmd {
+		uint64_t nfst : 4;
+		uint64_t reserved_4_5 : 2;
+		uint64_t nlst : 4;
+		uint64_t reserved_10_11 : 2;
+		uint64_t pvfe : 1;
+		uint64_t reserved_13 : 1;
+		uint64_t func : 16;
+		uint64_t aura : 20;
+		uint64_t xtype : 2;
+		uint64_t reserved_52_53 : 2;
+		uint64_t pt : 2;
+		uint64_t fport : 2;
+		uint64_t reserved_58_59 : 2;
+		uint64_t lport : 2;
+		uint64_t reserved_62_63 : 2;
+		/* Word 0 - End */
+	} cn10k;
+};
 
 struct cnxk_dpi_compl_s {
 	uint64_t cdata;
@@ -37,26 +93,39 @@  struct cnxk_dpi_cdesc_data_s {
 };
 
 struct cnxk_dpi_conf {
-	union dpi_instr_hdr_s hdr;
+	union cnxk_dpi_instr_cmd cmd;
 	struct cnxk_dpi_cdesc_data_s c_desc;
 	uint16_t pnum_words;
 	uint16_t pending;
 	uint16_t desc_idx;
-	uint16_t pad0;
 	struct rte_dma_stats stats;
 	uint64_t completed_offset;
 };
 
 struct cnxk_dpi_vf_s {
+	/* Fast path */
 	uint64_t *chunk_base;
 	uint16_t chunk_head;
 	uint16_t chunk_size_m1;
 	struct rte_mempool *chunk_pool;
 	struct cnxk_dpi_conf conf[MAX_VCHANS_PER_QUEUE];
+	/* Slow path */
 	struct roc_dpi rdpi;
 	uint32_t aura;
 	uint16_t num_vchans;
 	uint16_t flag;
+	uint8_t is_cn10k;
 } __plt_cache_aligned;
 
+int cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		     uint32_t length, uint64_t flags);
+int cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+			const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+			uint64_t flags);
+int cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		      uint32_t length, uint64_t flags);
+int cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+			 const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+			 uint64_t flags);
+
 #endif
diff --git a/drivers/dma/cnxk/cnxk_dmadev_fp.c b/drivers/dma/cnxk/cnxk_dmadev_fp.c
new file mode 100644
index 0000000000..db1e57bf51
--- /dev/null
+++ b/drivers/dma/cnxk/cnxk_dmadev_fp.c
@@ -0,0 +1,455 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (C) 2021 Marvell International Ltd.
+ */
+
+#include <rte_vect.h>
+
+#include "cnxk_dmadev.h"
+
+#define DMA_DW_PER_SINGLE_CMD 8
+#define DMA_HDR_LEN	      4
+#define DMA_CMD_LEN(src, dst) (DMA_HDR_LEN + (src << 1) + (dst << 1))
+
+static __plt_always_inline void
+__dpi_cpy_scalar(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+	uint8_t i;
+
+	for (i = 0; i < n; i++)
+		dst[i] = src[i];
+}
+
+static __plt_always_inline void
+__dpi_cpy_scalar_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+	uint8_t i;
+
+	for (i = 0; i < n; i++) {
+		*dst++ = src[i].length;
+		*dst++ = src[i].addr;
+	}
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_scalar_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+	uint8_t i;
+
+	for (i = 0; i < n && lmt; i++) {
+		*dst++ = src[i].length;
+		*dst++ = src[i].addr;
+		lmt -= 2;
+	}
+
+	return i;
+}
+
+#if defined(RTE_ARCH_ARM64)
+static __plt_always_inline void
+__dpi_cpy_vector(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+	uint64x2_t vec;
+	uint8_t i;
+
+	for (i = 0; i < n; i += 2) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vst1q_u64(&dst[i], vec);
+	}
+}
+
+static __plt_always_inline void
+__dpi_cpy_vector_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+	uint64x2_t mask = {0xFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
+	uint64x2_t vec;
+	uint8_t i;
+
+	for (i = 0; i < n; i++) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vec = vextq_u64(vec, vec, 1);
+		vec = vandq_u64(vec, mask);
+		vst1q_u64(dst, vec);
+		dst += 2;
+	}
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_vector_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+	uint64x2_t mask = {0xFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
+	uint64x2_t vec;
+	uint8_t i;
+
+	for (i = 0; i < n && lmt; i++) {
+		vec = vld1q_u64((const uint64_t *)&src[i]);
+		vec = vextq_u64(vec, vec, 1);
+		vec = vandq_u64(vec, mask);
+		vst1q_u64(dst, vec);
+		dst += 2;
+		lmt -= 2;
+	}
+
+	return i;
+}
+#endif
+
+static __plt_always_inline void
+__dpi_cpy(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+#if defined(RTE_ARCH_ARM64)
+	__dpi_cpy_vector(src, dst, n);
+#else
+	__dpi_cpy_scalar(src, dst, n);
+#endif
+}
+
+static __plt_always_inline void
+__dpi_cpy_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+#if defined(RTE_ARCH_ARM64)
+	__dpi_cpy_vector_sg(src, dst, n);
+#else
+	__dpi_cpy_scalar_sg(src, dst, n);
+#endif
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, uint16_t lmt)
+{
+#if defined(RTE_ARCH_ARM64)
+	return __dpi_cpy_vector_sg_lmt(src, dst, n, lmt);
+#else
+	return __dpi_cpy_scalar_sg_lmt(src, dst, n, lmt);
+#endif
+}
+
+static __plt_always_inline int
+__dpi_queue_write_single(struct cnxk_dpi_vf_s *dpi, uint64_t *cmd)
+{
+	uint64_t *ptr = dpi->chunk_base;
+
+	/*
+	 * Normally there is plenty of room in the current buffer for the
+	 * command
+	 */
+	if (dpi->chunk_head + DMA_DW_PER_SINGLE_CMD < dpi->chunk_size_m1) {
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy_scalar(cmd, ptr, DMA_DW_PER_SINGLE_CMD);
+		dpi->chunk_head += DMA_DW_PER_SINGLE_CMD;
+	} else {
+		uint64_t *new_buff = NULL;
+		int count;
+
+		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+			plt_dpi_dbg("Failed to alloc next buffer from NPA");
+			return -ENOSPC;
+		}
+
+		/*
+		 * Figure out how many cmd words will fit in this buffer.
+		 * One location will be needed for the next buffer pointer.
+		 */
+		count = dpi->chunk_size_m1 - dpi->chunk_head;
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy_scalar(cmd, ptr, count);
+
+		ptr += count;
+		*ptr = (uint64_t)new_buff;
+		ptr = new_buff;
+
+		__dpi_cpy_scalar(cmd + count, ptr, DMA_DW_PER_SINGLE_CMD - count);
+
+		/*
+		 * The current buffer is full and has a link to the next
+		 * buffers. Time to write the rest of the commands into
+		 * the new buffer.
+		 */
+		dpi->chunk_base = new_buff;
+		dpi->chunk_head = DMA_DW_PER_SINGLE_CMD - count;
+	}
+
+	return 0;
+}
+
+static __plt_always_inline int
+__dpi_queue_write_sg(struct cnxk_dpi_vf_s *dpi, uint64_t *hdr, const struct rte_dma_sge *src,
+		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst)
+{
+	uint8_t cmd_len = DMA_CMD_LEN(nb_src, nb_dst);
+	uint64_t *ptr = dpi->chunk_base;
+
+	/*
+	 * Normally there is plenty of room in the current buffer for the
+	 * command
+	 */
+	if (dpi->chunk_head + cmd_len < dpi->chunk_size_m1) {
+		ptr += dpi->chunk_head;
+
+		__dpi_cpy(hdr, ptr, DMA_HDR_LEN);
+		ptr += DMA_HDR_LEN;
+		__dpi_cpy_sg(src, ptr, nb_src);
+		ptr += (nb_src << 1);
+		__dpi_cpy_sg(dst, ptr, nb_dst);
+
+		dpi->chunk_head += cmd_len;
+	} else {
+		uint64_t *new_buff = NULL, *buf;
+		uint16_t count;
+
+		if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+			plt_dpi_dbg("Failed to alloc next buffer from NPA");
+			return -ENOSPC;
+		}
+
+		/*
+		 * Figure out how many cmd words will fit in this buffer.
+		 * One location will be needed for the next buffer pointer.
+		 */
+		count = dpi->chunk_size_m1 - dpi->chunk_head;
+		ptr += dpi->chunk_head;
+		buf = new_buff;
+		if (count <= 4) {
+			__dpi_cpy(hdr, ptr, count);
+			ptr += count;
+			__dpi_cpy(&hdr[count], buf, 4);
+			buf += (4 - count);
+		} else {
+			uint8_t i;
+
+			__dpi_cpy(hdr, ptr, 4);
+			ptr += 4;
+			count -= 4;
+
+			i = __dpi_cpy_sg_lmt(src, ptr, nb_src, count);
+			src += i;
+			nb_src -= i;
+			count -= (i << 1);
+			ptr += (i << 1);
+
+			i = __dpi_cpy_sg_lmt(dst, ptr, nb_dst, count);
+			dst += i;
+			nb_dst -= i;
+			ptr += (i << 1);
+		}
+		*ptr = (uint64_t)new_buff;
+
+		__dpi_cpy_sg(src, buf, nb_src);
+		buf += (nb_src << 1);
+
+		__dpi_cpy_sg(dst, buf, nb_dst);
+		buf += (nb_dst << 1);
+
+		/*
+		 * The current buffer is full and has a link to the next
+		 * buffers. Time to write the rest of the commands into
+		 * the new buffer.
+		 */
+		dpi->chunk_base = new_buff;
+		dpi->chunk_head = buf - new_buff;
+	}
+
+	return 0;
+}
+
+int
+cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst, uint32_t length,
+		 uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	uint64_t cmd[DMA_DW_PER_SINGLE_CMD];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	STRM_INC(dpi_conf->c_desc, tail);
+
+	cmd[0] = (1UL << 54) | (1UL << 48);
+	cmd[1] = dpi_conf->cmd.u;
+	cmd[2] = (uint64_t)comp_ptr;
+	cmd[4] = length;
+	cmd[6] = length;
+
+	/*
+	 * For inbound case, src pointers are last pointers.
+	 * For all other cases, src pointers are first pointers.
+	 */
+	if (((dpi_conf->cmd.u >> 48) & DPI_HDR_XTYPE_MASK) == DPI_XTYPE_INBOUND) {
+		cmd[5] = dst;
+		cmd[7] = src;
+	} else {
+		cmd[5] = src;
+		cmd[7] = dst;
+	}
+
+	rc = __dpi_queue_write_single(dpivf, cmd);
+	if (unlikely(rc)) {
+		STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + DMA_DW_PER_SINGLE_CMD,
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += DMA_DW_PER_SINGLE_CMD;
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+		    const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst, uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	const struct rte_dma_sge *fptr, *lptr;
+	struct cnxk_dpi_compl_s *comp_ptr;
+	uint64_t hdr[4];
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	STRM_INC(dpi_conf->c_desc, tail);
+
+	hdr[1] = dpi_conf->cmd.u;
+	hdr[2] = (uint64_t)comp_ptr;
+
+	/*
+	 * For inbound case, src pointers are last pointers.
+	 * For all other cases, src pointers are first pointers.
+	 */
+	if (((dpi_conf->cmd.u >> 48) & DPI_HDR_XTYPE_MASK) == DPI_XTYPE_INBOUND) {
+		fptr = dst;
+		lptr = src;
+		RTE_SWAP(nb_src, nb_dst);
+	} else {
+		fptr = src;
+		lptr = dst;
+	}
+	hdr[0] = ((uint64_t)nb_dst << 54) | (uint64_t)nb_src << 48;
+
+	rc = __dpi_queue_write_sg(dpivf, hdr, fptr, lptr, nb_src, nb_dst);
+	if (unlikely(rc)) {
+		STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + DMA_CMD_LEN(nb_src, nb_dst),
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += DMA_CMD_LEN(nb_src, nb_dst);
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+		  uint32_t length, uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	uint64_t cmd[DMA_DW_PER_SINGLE_CMD];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	STRM_INC(dpi_conf->c_desc, tail);
+
+	cmd[0] = dpi_conf->cmd.u | (1U << 6) | 1U;
+	cmd[1] = (uint64_t)comp_ptr;
+	cmd[2] = 0;
+	cmd[4] = length;
+	cmd[5] = src;
+	cmd[6] = length;
+	cmd[7] = dst;
+
+	rc = __dpi_queue_write_single(dpivf, cmd);
+	if (unlikely(rc)) {
+		STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + DMA_DW_PER_SINGLE_CMD,
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += 8;
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
+
+int
+cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct rte_dma_sge *src,
+		     const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+		     uint64_t flags)
+{
+	struct cnxk_dpi_vf_s *dpivf = dev_private;
+	struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+	struct cnxk_dpi_compl_s *comp_ptr;
+	uint64_t hdr[4];
+	int rc;
+
+	if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+		     dpi_conf->c_desc.head))
+		return -ENOSPC;
+
+	comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+	STRM_INC(dpi_conf->c_desc, tail);
+
+	hdr[0] = dpi_conf->cmd.u | (nb_dst << 6) | nb_src;
+	hdr[1] = (uint64_t)comp_ptr;
+	hdr[2] = 0;
+
+	rc = __dpi_queue_write_sg(dpivf, hdr, src, dst, nb_src, nb_dst);
+	if (unlikely(rc)) {
+		STRM_DEC(dpi_conf->c_desc, tail);
+		return rc;
+	}
+
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+		rte_wmb();
+		plt_write64(dpi_conf->pnum_words + DMA_CMD_LEN(nb_src, nb_dst),
+			    dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+		dpi_conf->stats.submitted += dpi_conf->pending + 1;
+		dpi_conf->pnum_words = 0;
+		dpi_conf->pending = 0;
+	} else {
+		dpi_conf->pnum_words += DMA_CMD_LEN(nb_src, nb_dst);
+		dpi_conf->pending++;
+	}
+
+	return dpi_conf->desc_idx++;
+}
diff --git a/drivers/dma/cnxk/meson.build b/drivers/dma/cnxk/meson.build
index b868fb14cb..e557349368 100644
--- a/drivers/dma/cnxk/meson.build
+++ b/drivers/dma/cnxk/meson.build
@@ -1,6 +1,13 @@ 
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright(C) 2021 Marvell International Ltd.
 
+error_cflags = ['-Wno-uninitialized']
+foreach flag: error_cflags
+    if cc.has_argument(flag)
+        cflags += flag
+    endif
+endforeach
+
 deps += ['bus_pci', 'common_cnxk', 'dmadev']
-sources = files('cnxk_dmadev.c')
+sources = files('cnxk_dmadev.c', 'cnxk_dmadev_fp.c')
 require_iova_in_mbuf = false
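
For reference, a standalone sketch (ARM64 only, hypothetical values) of
the NEON swizzle that __dpi_cpy_vector_sg() above performs on each
rte_dma_sge: the struct below is a stand-in with the same 16-byte layout
(64-bit addr, 32-bit length, tail padding), and the mask, as in the
driver, keeps only the low 24 bits of the length lane while stripping
the padding bytes that share that lane.

/* Build with e.g.: gcc -O2 neon_swizzle.c (ARM64 toolchain assumed). */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <arm_neon.h>

/* Stand-in with rte_dma_sge's 16 B layout: addr, 32-bit length, padding. */
struct sge {
	uint64_t addr;
	uint32_t length;
	uint32_t pad;
};

int main(void)
{
	/* Same mask as the driver: 24 length bits, full 64-bit address. */
	const uint64x2_t mask = {0xFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
	struct sge src = {.addr = 0xdead0000beefULL, .length = 128, .pad = 0x5a5a5a5a};
	uint64_t dst[2];
	uint64x2_t vec;

	vec = vld1q_u64((const uint64_t *)&src); /* lanes: {addr, pad:length} */
	vec = vextq_u64(vec, vec, 1);            /* rotate: {pad:length, addr} */
	vec = vandq_u64(vec, mask);              /* drop padding, clamp length */
	vst1q_u64(dst, vec);

	/* Prints "length=128 addr=0xdead0000beef": the {length, addr} order
	 * the DPI command expects, matching __dpi_cpy_scalar_sg().
	 */
	printf("length=%" PRIu64 " addr=0x%" PRIx64 "\n", dst[0], dst[1]);
	return 0;
}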