[05/11] event/cnxk: net/cnxk: support transmit completion

Message ID 20221128095442.3185112-5-ndabilpuram@marvell.com (mailing list archive)
State Accepted, archived
Delegated to: Jerin Jacob
Series [01/11] common/cnxk: free pending sqe buffers

Checks

Context Check Description
ci/checkpatch warning coding style issues

Commit Message

Nithin Dabilpuram Nov. 28, 2022, 9:54 a.m. UTC
  From: Rakesh Kudurumalla <rkudurumalla@marvell.com>

Added support to call the callback handler provided by the user
when an external buffer is attached to the mbuf and transmit
completion is enabled. Also added support to enable transmit
completion via a device argument.
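
A minimal sketch of such a callback, assuming the generic mbuf
external-buffer API (the name app_extbuf_free_cb is illustrative):

#include <rte_common.h>
#include <rte_malloc.h>
#include <rte_mbuf.h>

/* Registered via rte_pktmbuf_ext_shinfo_init_helper(). With
 * tx_compl_ena=1 the PMD defers this call until the hardware reports
 * transmit completion for the packet, after which the application
 * owns the buffer again and may free or reuse it.
 */
static void
app_extbuf_free_cb(void *addr, void *opaque)
{
        RTE_SET_USED(opaque);
        rte_free(addr);
}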

Signed-off-by: Rakesh Kudurumalla <rkudurumalla@marvell.com>
---
 doc/guides/nics/cnxk.rst               |  14 ++
 drivers/event/cnxk/cn10k_worker.h      |   7 +-
 drivers/event/cnxk/cn9k_worker.h       |   8 +-
 drivers/net/cnxk/cn10k_ethdev.c        |  54 ++++++
 drivers/net/cnxk/cn10k_ethdev.h        |   1 +
 drivers/net/cnxk/cn10k_tx.h            | 215 +++++++++++++++++++----
 drivers/net/cnxk/cn9k_ethdev.c         |  54 ++++++
 drivers/net/cnxk/cn9k_ethdev.h         |   1 +
 drivers/net/cnxk/cn9k_tx.h             | 226 +++++++++++++++++++++----
 drivers/net/cnxk/cnxk_ethdev.c         |  28 ++-
 drivers/net/cnxk/cnxk_ethdev.h         |  17 ++
 drivers/net/cnxk/cnxk_ethdev_devargs.c |   6 +
 12 files changed, 553 insertions(+), 78 deletions(-)
  

Patch

diff --git a/doc/guides/nics/cnxk.rst b/doc/guides/nics/cnxk.rst
index 7da6cb3967..be176b53a2 100644
--- a/doc/guides/nics/cnxk.rst
+++ b/doc/guides/nics/cnxk.rst
@@ -361,6 +361,20 @@  Runtime Config Options
 
       -a 0002:1d:00.0,sdp_channel_mask=0x700/0xf00
 
+- ``Transmit completion handler`` (default ``0``)
+
+   When the transmit completion handler is enabled, the PMD invokes the
+   callback handler provided by the application for every packet that has
+   an external buffer attached to the mbuf, frees the main mbuf and hands
+   the external buffer back to the application. Once handed over, it is
+   the application's responsibility to either free or reuse that buffer.
+
+   The handler is enabled using the ``tx_compl_ena`` ``devargs`` parameter.
+
+   For example::
+
+      -a 0002:01:00.1,tx_compl_ena=1
+
    With the above configuration, RTE Flow rules API will set the channel
    and channel mask as 0x700 and 0xF00 in the MCAM entries of the  flow rules
    created on the SDP device. This option needs to be used when more than one
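
For context, the snippet below is an illustrative application-side sketch:
it wraps an rte_malloc'd buffer into an mbuf via the generic
rte_pktmbuf_ext_shinfo_init_helper() and rte_pktmbuf_attach_extbuf() helpers,
so that with tx_compl_ena=1 the supplied free callback (e.g. the
app_extbuf_free_cb() sketched in the commit message) runs only once the
hardware reports transmit completion. The name app_make_extbuf_pkt() is
hypothetical.

#include <rte_common.h>
#include <rte_malloc.h>
#include <rte_mbuf.h>
#include <rte_memory.h>

/* Build a packet whose payload lives in an application-owned buffer.
 * The PMD frees the mbuf itself on Tx completion and invokes free_cb
 * for the external buffer, handing it back to the application.
 */
static struct rte_mbuf *
app_make_extbuf_pkt(struct rte_mempool *mp, uint16_t data_len,
                    rte_mbuf_extbuf_free_callback_t free_cb)
{
        struct rte_mbuf_ext_shared_info *shinfo;
        uint16_t buf_len;
        struct rte_mbuf *m;
        void *buf;

        m = rte_pktmbuf_alloc(mp);
        if (m == NULL)
                return NULL;

        /* Room for the payload plus the shared info carved from the
         * buffer tail by the shinfo init helper.
         */
        buf_len = RTE_ALIGN_CEIL(data_len + sizeof(*shinfo), 8);
        buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
        if (buf == NULL) {
                rte_pktmbuf_free(m);
                return NULL;
        }

        /* Shared info holds the refcount and the free callback. */
        shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
                                                    free_cb, NULL);
        rte_pktmbuf_attach_extbuf(m, buf, rte_mem_virt2iova(buf),
                                  buf_len, shinfo);
        rte_pktmbuf_append(m, data_len);
        return m;
}
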
diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index 75a2ff244a..332a2e27c2 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -559,6 +559,9 @@  cn10k_sso_tx_one(struct cn10k_sso_hws *ws, struct rte_mbuf *m, uint64_t *cmd,
 	if (cn10k_sso_sq_depth(txq) <= 0)
 		return 0;
 
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && txq->tx_compl.ena)
+		handle_tx_completion_pkts(txq, 1, 1);
+
 	cn10k_nix_tx_skeleton(txq, cmd, flags, 0);
 	/* Perform header writes before barrier
 	 * for TSO
@@ -566,7 +569,7 @@  cn10k_sso_tx_one(struct cn10k_sso_hws *ws, struct rte_mbuf *m, uint64_t *cmd,
 	if (flags & NIX_TX_OFFLOAD_TSO_F)
 		cn10k_nix_xmit_prepare_tso(m, flags);
 
-	cn10k_nix_xmit_prepare(m, cmd, flags, txq->lso_tun_fmt, &sec,
+	cn10k_nix_xmit_prepare(txq, m, cmd, flags, txq->lso_tun_fmt, &sec,
 			       txq->mark_flag, txq->mark_fmt);
 
 	laddr = lmt_addr;
@@ -581,7 +584,7 @@  cn10k_sso_tx_one(struct cn10k_sso_hws *ws, struct rte_mbuf *m, uint64_t *cmd,
 	cn10k_nix_xmit_mv_lmt_base(laddr, cmd, flags);
 
 	if (flags & NIX_TX_MULTI_SEG_F)
-		segdw = cn10k_nix_prepare_mseg(m, (uint64_t *)laddr, flags);
+		segdw = cn10k_nix_prepare_mseg(txq, m, (uint64_t *)laddr, flags);
 	else
 		segdw = cn10k_nix_tx_ext_subs(flags) + 2;
 
diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index 4c3932da47..54213db3b4 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -781,12 +781,16 @@  cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
 	    !(flags & NIX_TX_OFFLOAD_SECURITY_F))
 		rte_io_wmb();
 	txq = cn9k_sso_hws_xtract_meta(m, txq_data);
+
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && txq->tx_compl.ena)
+		handle_tx_completion_pkts(txq, 1, 1);
+
 	if (((txq->nb_sqb_bufs_adj -
 	      __atomic_load_n((int16_t *)txq->fc_mem, __ATOMIC_RELAXED))
 	     << txq->sqes_per_sqb_log2) <= 0)
 		return 0;
 	cn9k_nix_tx_skeleton(txq, cmd, flags, 0);
-	cn9k_nix_xmit_prepare(m, cmd, flags, txq->lso_tun_fmt, txq->mark_flag,
+	cn9k_nix_xmit_prepare(txq, m, cmd, flags, txq->lso_tun_fmt, txq->mark_flag,
 			      txq->mark_fmt);
 
 	if (flags & NIX_TX_OFFLOAD_SECURITY_F) {
@@ -808,7 +812,7 @@  cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
 	}
 
 	if (flags & NIX_TX_MULTI_SEG_F) {
-		const uint16_t segdw = cn9k_nix_prepare_mseg(m, cmd, flags);
+		const uint16_t segdw = cn9k_nix_prepare_mseg(txq, m, cmd, flags);
 		cn9k_nix_xmit_prepare_tstamp(txq, cmd, m->ol_flags, segdw,
 					     flags);
 		if (!CNXK_TT_FROM_EVENT(ev->event)) {
diff --git a/drivers/net/cnxk/cn10k_ethdev.c b/drivers/net/cnxk/cn10k_ethdev.c
index 4658713591..61278bb72c 100644
--- a/drivers/net/cnxk/cn10k_ethdev.c
+++ b/drivers/net/cnxk/cn10k_ethdev.c
@@ -50,6 +50,7 @@  nix_tx_offload_flags(struct rte_eth_dev *eth_dev)
 {
 	struct cnxk_eth_dev *dev = cnxk_eth_pmd_priv(eth_dev);
 	uint64_t conf = dev->tx_offloads;
+	struct roc_nix *nix = &dev->nix;
 	uint16_t flags = 0;
 
 	/* Fastpath is dependent on these enums */
@@ -113,6 +114,9 @@  nix_tx_offload_flags(struct rte_eth_dev *eth_dev)
 	if (dev->tx_mark)
 		flags |= NIX_TX_OFFLOAD_VLAN_QINQ_F;
 
+	if (nix->tx_compl_ena)
+		flags |= NIX_TX_OFFLOAD_MBUF_NOFF_F;
+
 	return flags;
 }
 
@@ -165,6 +169,49 @@  nix_form_default_desc(struct cnxk_eth_dev *dev, struct cn10k_eth_txq *txq,
 	rte_wmb();
 }
 
+static int
+cn10k_nix_tx_compl_setup(struct cnxk_eth_dev *dev,
+		struct cn10k_eth_txq *txq,
+		struct roc_nix_sq *sq, uint16_t nb_desc)
+{
+	struct roc_nix_cq *cq;
+
+	cq = &dev->cqs[sq->cqid];
+	txq->tx_compl.desc_base = (uintptr_t)cq->desc_base;
+	txq->tx_compl.cq_door = cq->door;
+	txq->tx_compl.cq_status = cq->status;
+	txq->tx_compl.wdata = cq->wdata;
+	txq->tx_compl.head = cq->head;
+	txq->tx_compl.qmask = cq->qmask;
+	/* Total array size holding buffers is equal to
+	 * number of entries in cq and sq
+	 * max buffer in array = desc in cq + desc in sq
+	 */
+	txq->tx_compl.nb_desc_mask = (2 * rte_align32pow2(nb_desc)) - 1;
+	txq->tx_compl.ena = true;
+
+	txq->tx_compl.ptr = (struct rte_mbuf **)plt_zmalloc(txq->tx_compl.nb_desc_mask *
+			sizeof(struct rte_mbuf *), 0);
+	if (!txq->tx_compl.ptr)
+		return -1;
+
+	return 0;
+}
+
+static void
+cn10k_nix_tx_queue_release(struct rte_eth_dev *eth_dev, uint16_t qid)
+{
+	struct cnxk_eth_dev *dev = cnxk_eth_pmd_priv(eth_dev);
+	struct roc_nix *nix = &dev->nix;
+	struct cn10k_eth_txq *txq;
+
+	cnxk_nix_tx_queue_release(eth_dev, qid);
+	txq = eth_dev->data->tx_queues[qid];
+
+	if (nix->tx_compl_ena)
+		plt_free(txq->tx_compl.ptr);
+}
+
 static int
 cn10k_nix_tx_queue_setup(struct rte_eth_dev *eth_dev, uint16_t qid,
 			 uint16_t nb_desc, unsigned int socket,
@@ -191,6 +238,12 @@  cn10k_nix_tx_queue_setup(struct rte_eth_dev *eth_dev, uint16_t qid,
 	/* Update fast path queue */
 	txq = eth_dev->data->tx_queues[qid];
 	txq->fc_mem = sq->fc;
+	if (nix->tx_compl_ena) {
+		rc = cn10k_nix_tx_compl_setup(dev, txq, sq, nb_desc);
+		if (rc)
+			return rc;
+	}
+
 	/* Store lmt base in tx queue for easy access */
 	txq->lmt_base = nix->lmt_base;
 	txq->io_addr = sq->io_addr;
@@ -711,6 +764,7 @@  nix_eth_dev_ops_override(void)
 	cnxk_eth_dev_ops.dev_configure = cn10k_nix_configure;
 	cnxk_eth_dev_ops.tx_queue_setup = cn10k_nix_tx_queue_setup;
 	cnxk_eth_dev_ops.rx_queue_setup = cn10k_nix_rx_queue_setup;
+	cnxk_eth_dev_ops.tx_queue_release = cn10k_nix_tx_queue_release;
 	cnxk_eth_dev_ops.tx_queue_stop = cn10k_nix_tx_queue_stop;
 	cnxk_eth_dev_ops.dev_start = cn10k_nix_dev_start;
 	cnxk_eth_dev_ops.dev_ptypes_set = cn10k_nix_ptypes_set;
diff --git a/drivers/net/cnxk/cn10k_ethdev.h b/drivers/net/cnxk/cn10k_ethdev.h
index 948c8348ad..c843ba9881 100644
--- a/drivers/net/cnxk/cn10k_ethdev.h
+++ b/drivers/net/cnxk/cn10k_ethdev.h
@@ -24,6 +24,7 @@  struct cn10k_eth_txq {
 	uint64_t ts_mem;
 	uint64_t mark_flag : 8;
 	uint64_t mark_fmt : 48;
+	struct cnxk_eth_txq_comp tx_compl;
 } __plt_cache_aligned;
 
 struct cn10k_eth_rxq {
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index 09c332b2b5..c51de742ad 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -643,6 +643,28 @@  cn10k_nix_prep_sec(struct rte_mbuf *m, uint64_t *cmd, uintptr_t *nixtx_addr,
 }
 #endif
 
+static __rte_always_inline uint64_t
+cn10k_nix_prefree_seg(struct rte_mbuf *m, struct cn10k_eth_txq *txq,
+		struct nix_send_hdr_s *send_hdr)
+{
+	uint32_t sqe_id;
+
+	if (RTE_MBUF_HAS_EXTBUF(m)) {
+		if (send_hdr->w0.pnc) {
+			txq->tx_compl.ptr[send_hdr->w1.sqe_id]->next = m;
+		} else {
+			sqe_id = __atomic_fetch_add(&txq->tx_compl.sqe_id, 1, __ATOMIC_RELAXED);
+			send_hdr->w0.pnc = 1;
+			send_hdr->w1.sqe_id = sqe_id &
+				txq->tx_compl.nb_desc_mask;
+			txq->tx_compl.ptr[send_hdr->w1.sqe_id] = m;
+		}
+		return 1;
+	} else {
+		return cnxk_nix_prefree_seg(m);
+	}
+}
+
 static __rte_always_inline void
 cn10k_nix_xmit_prepare_tso(struct rte_mbuf *m, const uint64_t flags)
 {
@@ -696,7 +718,8 @@  cn10k_nix_xmit_prepare_tso(struct rte_mbuf *m, const uint64_t flags)
 }
 
 static __rte_always_inline void
-cn10k_nix_xmit_prepare(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags,
+cn10k_nix_xmit_prepare(struct cn10k_eth_txq *txq,
+		       struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags,
 		       const uint64_t lso_tun_fmt, bool *sec, uint8_t mark_flag,
 		       uint64_t mark_fmt)
 {
@@ -888,7 +911,7 @@  cn10k_nix_xmit_prepare(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags,
 			 *		is greater than 1
 			 * DF bit = 0 otherwise
 			 */
-			send_hdr->w0.df = cnxk_nix_prefree_seg(m);
+			send_hdr->w0.df = cn10k_nix_prefree_seg(m, txq, send_hdr);
 		}
 		/* Mark mempool object as "put" since it is freed by NIX */
 		if (!send_hdr->w0.df)
@@ -959,7 +982,8 @@  cn10k_nix_xmit_prepare_tstamp(struct cn10k_eth_txq *txq, uintptr_t lmt_addr,
 }
 
 static __rte_always_inline uint16_t
-cn10k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
+cn10k_nix_prepare_mseg(struct cn10k_eth_txq *txq,
+		       struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
 {
 	uint64_t prefree = 0, aura0, aura, nb_segs, segdw;
 	struct nix_send_hdr_s *send_hdr;
@@ -993,7 +1017,7 @@  cn10k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
 
 	/* Set invert df if buffer is not to be freed by H/W */
 	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
-		prefree = cnxk_nix_prefree_seg(m);
+		prefree = cn10k_nix_prefree_seg(m, txq, send_hdr);
 		l_sg.i1 = prefree;
 	}
 
@@ -1035,7 +1059,7 @@  cn10k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
 
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 			aura = roc_npa_aura_handle_to_aura(m->pool->pool_id);
-			prefree = cnxk_nix_prefree_seg(m);
+			prefree = cn10k_nix_prefree_seg(m, txq, send_hdr);
 			is_sg2 = aura != aura0 && !prefree;
 		}
 
@@ -1119,6 +1143,83 @@  cn10k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
 	return segdw;
 }
 
+static inline uint16_t
+nix_tx_compl_nb_pkts(struct cn10k_eth_txq *txq, const uint64_t wdata,
+		const uint16_t pkts, const uint32_t qmask)
+{
+	uint32_t available = txq->tx_compl.available;
+
+	/* Update the available count if cached value is not enough */
+	if (unlikely(available < pkts)) {
+		uint64_t reg, head, tail;
+
+		/* Use LDADDA version to avoid reorder */
+		reg = roc_atomic64_add_sync(wdata, txq->tx_compl.cq_status);
+		/* CQ_OP_STATUS operation error */
+		if (reg & BIT_ULL(NIX_CQ_OP_STAT_OP_ERR) ||
+				reg & BIT_ULL(NIX_CQ_OP_STAT_CQ_ERR))
+			return 0;
+
+		tail = reg & 0xFFFFF;
+		head = (reg >> 20) & 0xFFFFF;
+		if (tail < head)
+			available = tail - head + qmask + 1;
+		else
+			available = tail - head;
+
+		txq->tx_compl.available = available;
+	}
+	return RTE_MIN(pkts, available);
+}
+
+static inline void
+handle_tx_completion_pkts(struct cn10k_eth_txq *txq, const uint16_t pkts,
+			  uint8_t mt_safe)
+{
+#define CNXK_NIX_CQ_ENTRY_SZ 128
+#define CQE_SZ(x)            ((x) * CNXK_NIX_CQ_ENTRY_SZ)
+
+	uint16_t tx_pkts = 0, nb_pkts;
+	const uintptr_t desc = txq->tx_compl.desc_base;
+	const uint64_t wdata = txq->tx_compl.wdata;
+	const uint32_t qmask = txq->tx_compl.qmask;
+	uint32_t head = txq->tx_compl.head;
+	struct nix_cqe_hdr_s *tx_compl_cq;
+	struct nix_send_comp_s *tx_compl_s0;
+	struct rte_mbuf *m_next, *m;
+
+	if (mt_safe)
+		rte_spinlock_lock(&txq->tx_compl.ext_buf_lock);
+
+	nb_pkts = nix_tx_compl_nb_pkts(txq, wdata, pkts, qmask);
+	while (tx_pkts < nb_pkts) {
+		rte_prefetch_non_temporal((void *)(desc +
+					(CQE_SZ((head + 2) & qmask))));
+		tx_compl_cq = (struct nix_cqe_hdr_s *)
+			(desc + CQE_SZ(head));
+		tx_compl_s0 = (struct nix_send_comp_s *)
+			((uint64_t *)tx_compl_cq + 1);
+		m = txq->tx_compl.ptr[tx_compl_s0->sqe_id];
+		while (m->next != NULL) {
+			m_next = m->next;
+			rte_pktmbuf_free_seg(m);
+			m = m_next;
+		}
+		rte_pktmbuf_free_seg(m);
+
+		head++;
+		head &= qmask;
+		tx_pkts++;
+	}
+	txq->tx_compl.head = head;
+	txq->tx_compl.available -= nb_pkts;
+
+	plt_write64((wdata | nb_pkts), txq->tx_compl.cq_door);
+
+	if (mt_safe)
+		rte_spinlock_unlock(&txq->tx_compl.ext_buf_lock);
+}
+
 static __rte_always_inline uint16_t
 cn10k_nix_xmit_pkts(void *tx_queue, uint64_t *ws, struct rte_mbuf **tx_pkts,
 		    uint16_t pkts, uint64_t *cmd, const uint16_t flags)
@@ -1139,6 +1240,9 @@  cn10k_nix_xmit_pkts(void *tx_queue, uint64_t *ws, struct rte_mbuf **tx_pkts,
 	uint64_t data;
 	bool sec;
 
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && txq->tx_compl.ena)
+		handle_tx_completion_pkts(txq, pkts, flags & NIX_TX_VWQE_F);
+
 	if (!(flags & NIX_TX_VWQE_F)) {
 		NIX_XMIT_FC_OR_RETURN(txq, pkts);
 		/* Reduce the cached count */
@@ -1181,7 +1285,7 @@  cn10k_nix_xmit_pkts(void *tx_queue, uint64_t *ws, struct rte_mbuf **tx_pkts,
 		if (flags & NIX_TX_OFFLOAD_TSO_F)
 			cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);
 
-		cn10k_nix_xmit_prepare(tx_pkts[i], cmd, flags, lso_tun_fmt,
+		cn10k_nix_xmit_prepare(txq, tx_pkts[i], cmd, flags, lso_tun_fmt,
 				       &sec, mark_flag, mark_fmt);
 
 		laddr = (uintptr_t)LMT_OFF(lbase, lnum, 0);
@@ -1285,6 +1389,9 @@  cn10k_nix_xmit_pkts_mseg(void *tx_queue, uint64_t *ws,
 	uintptr_t laddr;
 	bool sec;
 
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && txq->tx_compl.ena)
+		handle_tx_completion_pkts(txq, pkts, flags & NIX_TX_VWQE_F);
+
 	if (!(flags & NIX_TX_VWQE_F)) {
 		NIX_XMIT_FC_OR_RETURN(txq, pkts);
 		/* Reduce the cached count */
@@ -1331,7 +1438,7 @@  cn10k_nix_xmit_pkts_mseg(void *tx_queue, uint64_t *ws,
 		if (flags & NIX_TX_OFFLOAD_TSO_F)
 			cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);
 
-		cn10k_nix_xmit_prepare(tx_pkts[i], cmd, flags, lso_tun_fmt,
+		cn10k_nix_xmit_prepare(txq, tx_pkts[i], cmd, flags, lso_tun_fmt,
 				       &sec, mark_flag, mark_fmt);
 
 		laddr = (uintptr_t)LMT_OFF(lbase, lnum, 0);
@@ -1345,7 +1452,7 @@  cn10k_nix_xmit_pkts_mseg(void *tx_queue, uint64_t *ws,
 		/* Move NIX desc to LMT/NIXTX area */
 		cn10k_nix_xmit_mv_lmt_base(laddr, cmd, flags);
 		/* Store sg list directly on lmt line */
-		segdw = cn10k_nix_prepare_mseg(tx_pkts[i], (uint64_t *)laddr,
+		segdw = cn10k_nix_prepare_mseg(txq, tx_pkts[i], (uint64_t *)laddr,
 					       flags);
 		cn10k_nix_xmit_prepare_tstamp(txq, laddr, tx_pkts[i]->ol_flags,
 					      segdw, flags);
@@ -1467,7 +1574,8 @@  cn10k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,
 }
 
 static __rte_always_inline uint16_t
-cn10k_nix_prepare_mseg_vec_noff(struct rte_mbuf *m, uint64_t *cmd,
+cn10k_nix_prepare_mseg_vec_noff(struct cn10k_eth_txq *txq,
+				struct rte_mbuf *m, uint64_t *cmd,
 				uint64x2_t *cmd0, uint64x2_t *cmd1,
 				uint64x2_t *cmd2, uint64x2_t *cmd3,
 				const uint32_t flags)
@@ -1482,7 +1590,7 @@  cn10k_nix_prepare_mseg_vec_noff(struct rte_mbuf *m, uint64_t *cmd,
 		vst1q_u64(cmd + 2, *cmd1); /* sg */
 	}
 
-	segdw = cn10k_nix_prepare_mseg(m, cmd, flags);
+	segdw = cn10k_nix_prepare_mseg(txq, m, cmd, flags);
 
 	if (flags & NIX_TX_OFFLOAD_TSTAMP_F)
 		vst1q_u64(cmd + segdw * 2 - 2, *cmd3);
@@ -1581,7 +1689,8 @@  cn10k_nix_prepare_mseg_vec(struct rte_mbuf *m, uint64_t *cmd, uint64x2_t *cmd0,
 #define NIX_DESCS_PER_LOOP 4
 
 static __rte_always_inline uint8_t
-cn10k_nix_prep_lmt_mseg_vector(struct rte_mbuf **mbufs, uint64x2_t *cmd0,
+cn10k_nix_prep_lmt_mseg_vector(struct cn10k_eth_txq *txq,
+			       struct rte_mbuf **mbufs, uint64x2_t *cmd0,
 			       uint64x2_t *cmd1, uint64x2_t *cmd2,
 			       uint64x2_t *cmd3, uint8_t *segdw,
 			       uint64_t *lmt_addr, __uint128_t *data128,
@@ -1599,7 +1708,7 @@  cn10k_nix_prep_lmt_mseg_vector(struct rte_mbuf **mbufs, uint64x2_t *cmd0,
 				lmt_addr += 16;
 				off = 0;
 			}
-			off += cn10k_nix_prepare_mseg_vec_noff(mbufs[j],
+			off += cn10k_nix_prepare_mseg_vec_noff(txq, mbufs[j],
 					lmt_addr + off * 2, &cmd0[j], &cmd1[j],
 					&cmd2[j], &cmd3[j], flags);
 		}
@@ -1741,14 +1850,15 @@  cn10k_nix_lmt_next(uint8_t dw, uintptr_t laddr, uint8_t *lnum, uint8_t *loff,
 }
 
 static __rte_always_inline void
-cn10k_nix_xmit_store(struct rte_mbuf *mbuf, uint8_t segdw, uintptr_t laddr,
+cn10k_nix_xmit_store(struct cn10k_eth_txq *txq,
+		     struct rte_mbuf *mbuf, uint8_t segdw, uintptr_t laddr,
 		     uint64x2_t cmd0, uint64x2_t cmd1, uint64x2_t cmd2,
 		     uint64x2_t cmd3, const uint16_t flags)
 {
 	uint8_t off;
 
 	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
-		cn10k_nix_prepare_mseg_vec_noff(mbuf, LMT_OFF(laddr, 0, 0),
+		cn10k_nix_prepare_mseg_vec_noff(txq, mbuf, LMT_OFF(laddr, 0, 0),
 						&cmd0, &cmd1, &cmd2, &cmd3,
 						flags);
 		return;
@@ -1816,9 +1926,12 @@  cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
 	uint64x2_t sgdesc01_w0, sgdesc23_w0;
 	uint64x2_t sgdesc01_w1, sgdesc23_w1;
 	struct cn10k_eth_txq *txq = tx_queue;
+	uint64x2_t xmask01_w0, xmask23_w0;
+	uint64x2_t xmask01_w1, xmask23_w1;
 	rte_iova_t io_addr = txq->io_addr;
 	uintptr_t laddr = txq->lmt_base;
 	uint8_t c_lnum, c_shft, c_loff;
+	struct nix_send_hdr_s send_hdr;
 	uint64x2_t ltypes01, ltypes23;
 	uint64x2_t xtmp128, ytmp128;
 	uint64x2_t xmask01, xmask23;
@@ -1831,6 +1944,9 @@  cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
 		uint64_t data[2];
 	} wd;
 
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && txq->tx_compl.ena)
+		handle_tx_completion_pkts(txq, pkts, flags & NIX_TX_VWQE_F);
+
 	if (!(flags & NIX_TX_VWQE_F)) {
 		NIX_XMIT_FC_OR_RETURN(txq, pkts);
 		scalar = pkts & (NIX_DESCS_PER_LOOP - 1);
@@ -2664,8 +2780,10 @@  cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
 		    !(flags & NIX_TX_MULTI_SEG_F) &&
 		    !(flags & NIX_TX_OFFLOAD_SECURITY_F)) {
 			/* Set don't free bit if reference count > 1 */
-			xmask01 = vdupq_n_u64(0);
-			xmask23 = xmask01;
+			xmask01_w0 = vdupq_n_u64(0);
+			xmask01_w1 = vdupq_n_u64(0);
+			xmask23_w0 = xmask01_w0;
+			xmask23_w1 = xmask01_w1;
 
 			/* Move mbufs to iova */
 			mbuf0 = (uint64_t *)tx_pkts[0];
@@ -2673,35 +2791,62 @@  cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
 			mbuf2 = (uint64_t *)tx_pkts[2];
 			mbuf3 = (uint64_t *)tx_pkts[3];
 
-			if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf0))
-				xmask01 = vsetq_lane_u64(0x80000, xmask01, 0);
-			else
+			send_hdr.w0.u = 0;
+			send_hdr.w1.u = 0;
+
+			if (cn10k_nix_prefree_seg((struct rte_mbuf *)mbuf0, txq, &send_hdr)) {
+				send_hdr.w0.df = 1;
+				xmask01_w0 = vsetq_lane_u64(send_hdr.w0.u, xmask01_w0, 0);
+				xmask01_w1 = vsetq_lane_u64(send_hdr.w1.u, xmask01_w1, 0);
+			} else {
 				RTE_MEMPOOL_CHECK_COOKIES(
 					((struct rte_mbuf *)mbuf0)->pool,
 					(void **)&mbuf0, 1, 0);
+			}
 
-			if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf1))
-				xmask01 = vsetq_lane_u64(0x80000, xmask01, 1);
-			else
+			send_hdr.w0.u = 0;
+			send_hdr.w1.u = 0;
+
+			if (cn10k_nix_prefree_seg((struct rte_mbuf *)mbuf1, txq, &send_hdr)) {
+				send_hdr.w0.df = 1;
+				xmask01_w0 = vsetq_lane_u64(send_hdr.w0.u, xmask01_w0, 1);
+				xmask01_w1 = vsetq_lane_u64(send_hdr.w1.u, xmask01_w1, 1);
+			} else {
 				RTE_MEMPOOL_CHECK_COOKIES(
 					((struct rte_mbuf *)mbuf1)->pool,
 					(void **)&mbuf1, 1, 0);
+			}
 
-			if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf2))
-				xmask23 = vsetq_lane_u64(0x80000, xmask23, 0);
-			else
+			send_hdr.w0.u = 0;
+			send_hdr.w1.u = 0;
+
+			if (cn10k_nix_prefree_seg((struct rte_mbuf *)mbuf2, txq, &send_hdr)) {
+				send_hdr.w0.df = 1;
+				xmask23_w0 = vsetq_lane_u64(send_hdr.w0.u, xmask23_w0, 0);
+				xmask23_w1 = vsetq_lane_u64(send_hdr.w1.u, xmask23_w1, 0);
+			} else {
 				RTE_MEMPOOL_CHECK_COOKIES(
 					((struct rte_mbuf *)mbuf2)->pool,
 					(void **)&mbuf2, 1, 0);
+			}
 
-			if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf3))
-				xmask23 = vsetq_lane_u64(0x80000, xmask23, 1);
-			else
+			send_hdr.w0.u = 0;
+			send_hdr.w1.u = 0;
+
+			if (cn10k_nix_prefree_seg((struct rte_mbuf *)mbuf3, txq, &send_hdr)) {
+				send_hdr.w0.df = 1;
+				xmask23_w0 = vsetq_lane_u64(send_hdr.w0.u, xmask23_w0, 1);
+				xmask23_w1 = vsetq_lane_u64(send_hdr.w1.u, xmask23_w1, 1);
+			} else {
 				RTE_MEMPOOL_CHECK_COOKIES(
 					((struct rte_mbuf *)mbuf3)->pool,
 					(void **)&mbuf3, 1, 0);
-			senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
-			senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
+			}
+
+			senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01_w0);
+			senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23_w0);
+			senddesc01_w1 = vorrq_u64(senddesc01_w1, xmask01_w1);
+			senddesc23_w1 = vorrq_u64(senddesc23_w1, xmask23_w1);
 		} else if (!(flags & NIX_TX_MULTI_SEG_F) &&
 			   !(flags & NIX_TX_OFFLOAD_SECURITY_F)) {
 			/* Move mbufs to iova */
@@ -2773,7 +2918,7 @@  cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
 						   &shift, &wd.data128, &next);
 
 			/* Store mbuf0 to LMTLINE/CPT NIXTX area */
-			cn10k_nix_xmit_store(tx_pkts[0], segdw[0], next,
+			cn10k_nix_xmit_store(txq, tx_pkts[0], segdw[0], next,
 					     cmd0[0], cmd1[0], cmd2[0], cmd3[0],
 					     flags);
 
@@ -2789,7 +2934,7 @@  cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
 						   &shift, &wd.data128, &next);
 
 			/* Store mbuf1 to LMTLINE/CPT NIXTX area */
-			cn10k_nix_xmit_store(tx_pkts[1], segdw[1], next,
+			cn10k_nix_xmit_store(txq, tx_pkts[1], segdw[1], next,
 					     cmd0[1], cmd1[1], cmd2[1], cmd3[1],
 					     flags);
 
@@ -2805,7 +2950,7 @@  cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
 						   &shift, &wd.data128, &next);
 
 			/* Store mbuf2 to LMTLINE/CPT NIXTX area */
-			cn10k_nix_xmit_store(tx_pkts[2], segdw[2], next,
+			cn10k_nix_xmit_store(txq, tx_pkts[2], segdw[2], next,
 					     cmd0[2], cmd1[2], cmd2[2], cmd3[2],
 					     flags);
 
@@ -2821,7 +2966,7 @@  cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
 						   &shift, &wd.data128, &next);
 
 			/* Store mbuf3 to LMTLINE/CPT NIXTX area */
-			cn10k_nix_xmit_store(tx_pkts[3], segdw[3], next,
+			cn10k_nix_xmit_store(txq, tx_pkts[3], segdw[3], next,
 					     cmd0[3], cmd1[3], cmd2[3], cmd3[3],
 					     flags);
 
@@ -2829,7 +2974,7 @@  cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
 			uint8_t j;
 
 			segdw[4] = 8;
-			j = cn10k_nix_prep_lmt_mseg_vector(tx_pkts, cmd0, cmd1,
+			j = cn10k_nix_prep_lmt_mseg_vector(txq, tx_pkts, cmd0, cmd1,
 							  cmd2, cmd3, segdw,
 							  (uint64_t *)
 							  LMT_OFF(laddr, lnum,
diff --git a/drivers/net/cnxk/cn9k_ethdev.c b/drivers/net/cnxk/cn9k_ethdev.c
index 3b702d9696..749214cf23 100644
--- a/drivers/net/cnxk/cn9k_ethdev.c
+++ b/drivers/net/cnxk/cn9k_ethdev.c
@@ -50,6 +50,7 @@  nix_tx_offload_flags(struct rte_eth_dev *eth_dev)
 {
 	struct cnxk_eth_dev *dev = cnxk_eth_pmd_priv(eth_dev);
 	uint64_t conf = dev->tx_offloads;
+	struct roc_nix *nix = &dev->nix;
 	uint16_t flags = 0;
 
 	/* Fastpath is dependent on these enums */
@@ -113,6 +114,9 @@  nix_tx_offload_flags(struct rte_eth_dev *eth_dev)
 	if (dev->tx_mark)
 		flags |= NIX_TX_OFFLOAD_VLAN_QINQ_F;
 
+	if (nix->tx_compl_ena)
+		flags |= NIX_TX_OFFLOAD_MBUF_NOFF_F;
+
 	return flags;
 }
 
@@ -165,12 +169,56 @@  nix_form_default_desc(struct cnxk_eth_dev *dev, struct cn9k_eth_txq *txq,
 	rte_wmb();
 }
 
+static int
+cn9k_nix_tx_compl_setup(struct cnxk_eth_dev *dev,
+		struct cn9k_eth_txq *txq,
+		struct roc_nix_sq *sq, uint16_t nb_desc)
+{
+	struct roc_nix_cq *cq;
+
+	cq = &dev->cqs[sq->cqid];
+	txq->tx_compl.desc_base = (uintptr_t)cq->desc_base;
+	txq->tx_compl.cq_door = cq->door;
+	txq->tx_compl.cq_status = cq->status;
+	txq->tx_compl.wdata = cq->wdata;
+	txq->tx_compl.head = cq->head;
+	txq->tx_compl.qmask = cq->qmask;
+	/* Total array size holding buffers is equal to
+	 * number of entries in cq and sq
+	 * max buffer in array = desc in cq + desc in sq
+	 */
+	txq->tx_compl.nb_desc_mask = (2 * rte_align32pow2(nb_desc)) - 1;
+	txq->tx_compl.ena = true;
+
+	txq->tx_compl.ptr = (struct rte_mbuf **)plt_zmalloc(txq->tx_compl.nb_desc_mask *
+			sizeof(struct rte_mbuf *), 0);
+	if (!txq->tx_compl.ptr)
+		return -1;
+
+	return 0;
+}
+
+static void
+cn9k_nix_tx_queue_release(struct rte_eth_dev *eth_dev, uint16_t qid)
+{
+	struct cnxk_eth_dev *dev = cnxk_eth_pmd_priv(eth_dev);
+	struct roc_nix *nix = &dev->nix;
+	struct cn9k_eth_txq *txq;
+
+	cnxk_nix_tx_queue_release(eth_dev, qid);
+	txq = eth_dev->data->tx_queues[qid];
+
+	if (nix->tx_compl_ena)
+		plt_free(txq->tx_compl.ptr);
+}
+
 static int
 cn9k_nix_tx_queue_setup(struct rte_eth_dev *eth_dev, uint16_t qid,
 			uint16_t nb_desc, unsigned int socket,
 			const struct rte_eth_txconf *tx_conf)
 {
 	struct cnxk_eth_dev *dev = cnxk_eth_pmd_priv(eth_dev);
+	struct roc_nix *nix = &dev->nix;
 	uint64_t mark_fmt, mark_flag;
 	struct roc_cpt_lf *inl_lf;
 	struct cn9k_eth_txq *txq;
@@ -190,6 +238,11 @@  cn9k_nix_tx_queue_setup(struct rte_eth_dev *eth_dev, uint16_t qid,
 	/* Update fast path queue */
 	txq = eth_dev->data->tx_queues[qid];
 	txq->fc_mem = sq->fc;
+	if (nix->tx_compl_ena) {
+		rc = cn9k_nix_tx_compl_setup(dev, txq, sq, nb_desc);
+		if (rc)
+			return rc;
+	}
 	txq->lmt_addr = sq->lmt_addr;
 	txq->io_addr = sq->io_addr;
 	txq->nb_sqb_bufs_adj = sq->nb_sqb_bufs_adj;
@@ -634,6 +687,7 @@  nix_eth_dev_ops_override(void)
 	/* Update platform specific ops */
 	cnxk_eth_dev_ops.dev_configure = cn9k_nix_configure;
 	cnxk_eth_dev_ops.tx_queue_setup = cn9k_nix_tx_queue_setup;
+	cnxk_eth_dev_ops.tx_queue_release = cn9k_nix_tx_queue_release;
 	cnxk_eth_dev_ops.rx_queue_setup = cn9k_nix_rx_queue_setup;
 	cnxk_eth_dev_ops.tx_queue_stop = cn9k_nix_tx_queue_stop;
 	cnxk_eth_dev_ops.dev_start = cn9k_nix_dev_start;
diff --git a/drivers/net/cnxk/cn9k_ethdev.h b/drivers/net/cnxk/cn9k_ethdev.h
index 472a4b06da..a82dcb3d19 100644
--- a/drivers/net/cnxk/cn9k_ethdev.h
+++ b/drivers/net/cnxk/cn9k_ethdev.h
@@ -24,6 +24,7 @@  struct cn9k_eth_txq {
 	uint16_t cpt_desc;
 	uint64_t mark_flag : 8;
 	uint64_t mark_fmt : 48;
+	struct cnxk_eth_txq_comp tx_compl;
 } __plt_cache_aligned;
 
 struct cn9k_eth_rxq {
diff --git a/drivers/net/cnxk/cn9k_tx.h b/drivers/net/cnxk/cn9k_tx.h
index 404edd6aed..17bbdce3a0 100644
--- a/drivers/net/cnxk/cn9k_tx.h
+++ b/drivers/net/cnxk/cn9k_tx.h
@@ -81,6 +81,28 @@  cn9k_nix_tx_skeleton(struct cn9k_eth_txq *txq, uint64_t *cmd,
 	}
 }
 
+static __rte_always_inline uint64_t
+cn9k_nix_prefree_seg(struct rte_mbuf *m, struct cn9k_eth_txq *txq,
+		struct nix_send_hdr_s *send_hdr)
+{
+	uint32_t sqe_id;
+
+	if (RTE_MBUF_HAS_EXTBUF(m)) {
+		if (send_hdr->w0.pnc) {
+			txq->tx_compl.ptr[send_hdr->w1.sqe_id]->next = m;
+		} else {
+			sqe_id = __atomic_fetch_add(&txq->tx_compl.sqe_id, 1, __ATOMIC_RELAXED);
+			send_hdr->w0.pnc = 1;
+			send_hdr->w1.sqe_id = sqe_id &
+				txq->tx_compl.nb_desc_mask;
+			txq->tx_compl.ptr[send_hdr->w1.sqe_id] = m;
+		}
+		return 1;
+	} else {
+		return cnxk_nix_prefree_seg(m);
+	}
+}
+
 static __rte_always_inline void
 cn9k_nix_xmit_prepare_tso(struct rte_mbuf *m, const uint64_t flags)
 {
@@ -134,7 +156,8 @@  cn9k_nix_xmit_prepare_tso(struct rte_mbuf *m, const uint64_t flags)
 }
 
 static __rte_always_inline void
-cn9k_nix_xmit_prepare(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags,
+cn9k_nix_xmit_prepare(struct cn9k_eth_txq *txq,
+		      struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags,
 		      const uint64_t lso_tun_fmt, uint8_t mark_flag,
 		      uint64_t mark_fmt)
 {
@@ -325,7 +348,7 @@  cn9k_nix_xmit_prepare(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags,
 			 *		is greater than 1
 			 * DF bit = 0 otherwise
 			 */
-			send_hdr->w0.df = cnxk_nix_prefree_seg(m);
+			send_hdr->w0.df = cn9k_nix_prefree_seg(m, txq, send_hdr);
 			/* Ensuring mbuf fields which got updated in
 			 * cnxk_nix_prefree_seg are written before LMTST.
 			 */
@@ -401,7 +424,8 @@  cn9k_nix_xmit_submit_lmt_release(const rte_iova_t io_addr)
 }
 
 static __rte_always_inline uint16_t
-cn9k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
+cn9k_nix_prepare_mseg(struct cn9k_eth_txq *txq,
+		      struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
 {
 	struct nix_send_hdr_s *send_hdr;
 	union nix_send_sg_s *sg;
@@ -429,7 +453,7 @@  cn9k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
 
 	/* Set invert df if buffer is not to be freed by H/W */
 	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
-		sg_u |= (cnxk_nix_prefree_seg(m) << 55);
+		sg_u |= (cn9k_nix_prefree_seg(m, txq, send_hdr) << 55);
 		rte_io_wmb();
 	}
 
@@ -450,7 +474,7 @@  cn9k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
 		*slist = rte_mbuf_data_iova(m);
 		/* Set invert df if buffer is not to be freed by H/W */
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
-			sg_u |= (cnxk_nix_prefree_seg(m) << (i + 55));
+			sg_u |= (cn9k_nix_prefree_seg(m, txq, send_hdr) << (i + 55));
 			/* Commit changes to mbuf */
 			rte_io_wmb();
 		}
@@ -520,6 +544,83 @@  cn9k_nix_xmit_mseg_one_release(uint64_t *cmd, void *lmt_addr,
 	} while (lmt_status == 0);
 }
 
+static inline uint16_t
+nix_tx_compl_nb_pkts(struct cn9k_eth_txq *txq, const uint64_t wdata,
+		const uint16_t pkts, const uint32_t qmask)
+{
+	uint32_t available = txq->tx_compl.available;
+
+	/* Update the available count if cached value is not enough */
+	if (unlikely(available < pkts)) {
+		uint64_t reg, head, tail;
+
+		/* Use LDADDA version to avoid reorder */
+		reg = roc_atomic64_add_sync(wdata, txq->tx_compl.cq_status);
+		/* CQ_OP_STATUS operation error */
+		if (reg & BIT_ULL(NIX_CQ_OP_STAT_OP_ERR) ||
+				reg & BIT_ULL(NIX_CQ_OP_STAT_CQ_ERR))
+			return 0;
+
+		tail = reg & 0xFFFFF;
+		head = (reg >> 20) & 0xFFFFF;
+		if (tail < head)
+			available = tail - head + qmask + 1;
+		else
+			available = tail - head;
+
+		txq->tx_compl.available = available;
+	}
+	return RTE_MIN(pkts, available);
+}
+
+static inline void
+handle_tx_completion_pkts(struct cn9k_eth_txq *txq, const uint16_t pkts,
+			  uint8_t mt_safe)
+{
+#define CNXK_NIX_CQ_ENTRY_SZ 128
+#define CQE_SZ(x)            ((x) * CNXK_NIX_CQ_ENTRY_SZ)
+
+	uint16_t tx_pkts = 0, nb_pkts;
+	const uintptr_t desc = txq->tx_compl.desc_base;
+	const uint64_t wdata = txq->tx_compl.wdata;
+	const uint32_t qmask = txq->tx_compl.qmask;
+	uint32_t head = txq->tx_compl.head;
+	struct nix_cqe_hdr_s *tx_compl_cq;
+	struct nix_send_comp_s *tx_compl_s0;
+	struct rte_mbuf *m_next, *m;
+
+	if (mt_safe)
+		rte_spinlock_lock(&txq->tx_compl.ext_buf_lock);
+
+	nb_pkts = nix_tx_compl_nb_pkts(txq, wdata, pkts, qmask);
+	while (tx_pkts < nb_pkts) {
+		rte_prefetch_non_temporal((void *)(desc +
+					(CQE_SZ((head + 2) & qmask))));
+		tx_compl_cq = (struct nix_cqe_hdr_s *)
+			(desc + CQE_SZ(head));
+		tx_compl_s0 = (struct nix_send_comp_s *)
+			((uint64_t *)tx_compl_cq + 1);
+		m = txq->tx_compl.ptr[tx_compl_s0->sqe_id];
+		while (m->next != NULL) {
+			m_next = m->next;
+			rte_pktmbuf_free_seg(m);
+			m = m_next;
+		}
+		rte_pktmbuf_free_seg(m);
+
+		head++;
+		head &= qmask;
+		tx_pkts++;
+	}
+	txq->tx_compl.head = head;
+	txq->tx_compl.available -= nb_pkts;
+
+	plt_write64((wdata | nb_pkts), txq->tx_compl.cq_door);
+
+	if (mt_safe)
+		rte_spinlock_unlock(&txq->tx_compl.ext_buf_lock);
+}
+
 static __rte_always_inline uint16_t
 cn9k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
 		   uint64_t *cmd, const uint16_t flags)
@@ -531,6 +632,9 @@  cn9k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
 	uint8_t mark_flag = 0;
 	uint16_t i;
 
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && txq->tx_compl.ena)
+		handle_tx_completion_pkts(txq, pkts, 0);
+
 	NIX_XMIT_FC_OR_RETURN(txq, pkts);
 
 	cn9k_nix_tx_skeleton(txq, cmd, flags, 1);
@@ -555,7 +659,7 @@  cn9k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
 		rte_io_wmb();
 
 	for (i = 0; i < pkts; i++) {
-		cn9k_nix_xmit_prepare(tx_pkts[i], cmd, flags, lso_tun_fmt,
+		cn9k_nix_xmit_prepare(txq, tx_pkts[i], cmd, flags, lso_tun_fmt,
 				      mark_flag, mark_fmt);
 		cn9k_nix_xmit_prepare_tstamp(txq, cmd, tx_pkts[i]->ol_flags, 4,
 					     flags);
@@ -580,6 +684,9 @@  cn9k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint16_t segdw;
 	uint64_t i;
 
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && txq->tx_compl.ena)
+		handle_tx_completion_pkts(txq, pkts, 0);
+
 	NIX_XMIT_FC_OR_RETURN(txq, pkts);
 
 	cn9k_nix_tx_skeleton(txq, cmd, flags, 1);
@@ -604,9 +711,9 @@  cn9k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
 		rte_io_wmb();
 
 	for (i = 0; i < pkts; i++) {
-		cn9k_nix_xmit_prepare(tx_pkts[i], cmd, flags, lso_tun_fmt,
+		cn9k_nix_xmit_prepare(txq, tx_pkts[i], cmd, flags, lso_tun_fmt,
 				      mark_flag, mark_fmt);
-		segdw = cn9k_nix_prepare_mseg(tx_pkts[i], cmd, flags);
+		segdw = cn9k_nix_prepare_mseg(txq, tx_pkts[i], cmd, flags);
 		cn9k_nix_xmit_prepare_tstamp(txq, cmd, tx_pkts[i]->ol_flags,
 					     segdw, flags);
 		cn9k_nix_xmit_mseg_one(cmd, lmt_addr, io_addr, segdw);
@@ -658,8 +765,9 @@  cn9k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,
 }
 
 static __rte_always_inline uint8_t
-cn9k_nix_prepare_mseg_vec_list(struct rte_mbuf *m, uint64_t *cmd,
-			       union nix_send_hdr_w0_u *sh,
+cn9k_nix_prepare_mseg_vec_list(struct cn9k_eth_txq *txq,
+			       struct rte_mbuf *m, uint64_t *cmd,
+			       struct nix_send_hdr_s *send_hdr,
 			       union nix_send_sg_s *sg, const uint32_t flags)
 {
 	struct rte_mbuf *m_next;
@@ -668,7 +776,7 @@  cn9k_nix_prepare_mseg_vec_list(struct rte_mbuf *m, uint64_t *cmd,
 	uint64_t segdw;
 	int i = 1;
 
-	sh->total = m->pkt_len;
+	send_hdr->w0.total = m->pkt_len;
 	/* Clear sg->u header before use */
 	sg->u &= 0xFC00000000000000;
 	sg_u = sg->u;
@@ -681,7 +789,7 @@  cn9k_nix_prepare_mseg_vec_list(struct rte_mbuf *m, uint64_t *cmd,
 
 	/* Set invert df if buffer is not to be freed by H/W */
 	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
-		sg_u |= (cnxk_nix_prefree_seg(m) << 55);
+		sg_u |= (cn9k_nix_prefree_seg(m, txq, send_hdr) << 55);
 		/* Mark mempool object as "put" since it is freed by NIX */
 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
 	if (!(sg_u & (1ULL << 55)))
@@ -697,7 +805,7 @@  cn9k_nix_prepare_mseg_vec_list(struct rte_mbuf *m, uint64_t *cmd,
 		*slist = rte_mbuf_data_iova(m);
 		/* Set invert df if buffer is not to be freed by H/W */
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
-			sg_u |= (cnxk_nix_prefree_seg(m) << (i + 55));
+			sg_u |= (cn9k_nix_prefree_seg(m, txq, send_hdr) << (i + 55));
 			/* Mark mempool object as "put" since it is freed by NIX
 			 */
 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
@@ -731,24 +839,29 @@  cn9k_nix_prepare_mseg_vec_list(struct rte_mbuf *m, uint64_t *cmd,
 	/* Default dwords */
 	segdw += 1 + !!(flags & NIX_TX_NEED_EXT_HDR) +
 		 !!(flags & NIX_TX_OFFLOAD_TSTAMP_F);
-	sh->sizem1 = segdw - 1;
+	send_hdr->w0.sizem1 = segdw - 1;
 
 	return segdw;
 }
 
 static __rte_always_inline uint8_t
-cn9k_nix_prepare_mseg_vec(struct rte_mbuf *m, uint64_t *cmd, uint64x2_t *cmd0,
+cn9k_nix_prepare_mseg_vec(struct cn9k_eth_txq *txq,
+			  struct rte_mbuf *m, uint64_t *cmd, uint64x2_t *cmd0,
 			  uint64x2_t *cmd1, const uint32_t flags)
 {
-	union nix_send_hdr_w0_u sh;
+	struct nix_send_hdr_s send_hdr;
 	union nix_send_sg_s sg;
 	uint8_t ret;
 
 	if (m->nb_segs == 1) {
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
+			send_hdr.w0.u = vgetq_lane_u64(cmd0[0], 0);
+			send_hdr.w1.u = vgetq_lane_u64(cmd0[0], 1);
 			sg.u = vgetq_lane_u64(cmd1[0], 0);
-			sg.u |= (cnxk_nix_prefree_seg(m) << 55);
+			sg.u |= (cn9k_nix_prefree_seg(m, txq, &send_hdr) << 55);
 			cmd1[0] = vsetq_lane_u64(sg.u, cmd1[0], 0);
+			cmd0[0] = vsetq_lane_u64(send_hdr.w0.u, cmd0[0], 0);
+			cmd0[0] = vsetq_lane_u64(send_hdr.w1.u, cmd0[0], 1);
 		}
 
 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
@@ -761,12 +874,14 @@  cn9k_nix_prepare_mseg_vec(struct rte_mbuf *m, uint64_t *cmd, uint64x2_t *cmd0,
 		       !!(flags & NIX_TX_OFFLOAD_TSTAMP_F);
 	}
 
-	sh.u = vgetq_lane_u64(cmd0[0], 0);
+	send_hdr.w0.u = vgetq_lane_u64(cmd0[0], 0);
+	send_hdr.w1.u = vgetq_lane_u64(cmd0[0], 1);
 	sg.u = vgetq_lane_u64(cmd1[0], 0);
 
-	ret = cn9k_nix_prepare_mseg_vec_list(m, cmd, &sh, &sg, flags);
+	ret = cn9k_nix_prepare_mseg_vec_list(txq, m, cmd, &send_hdr, &sg, flags);
 
-	cmd0[0] = vsetq_lane_u64(sh.u, cmd0[0], 0);
+	cmd0[0] = vsetq_lane_u64(send_hdr.w0.u, cmd0[0], 0);
+	cmd0[0] = vsetq_lane_u64(send_hdr.w1.u, cmd0[0], 1);
 	cmd1[0] = vsetq_lane_u64(sg.u, cmd1[0], 0);
 	return ret;
 }
@@ -908,13 +1023,19 @@  cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64x2_t sgdesc01_w1, sgdesc23_w1;
 	struct cn9k_eth_txq *txq = tx_queue;
 	uint64_t *lmt_addr = txq->lmt_addr;
+	uint64x2_t xmask01_w0, xmask23_w0;
+	uint64x2_t xmask01_w1, xmask23_w1;
 	rte_iova_t io_addr = txq->io_addr;
+	struct nix_send_hdr_s send_hdr;
 	uint64x2_t ltypes01, ltypes23;
 	uint64x2_t xtmp128, ytmp128;
 	uint64x2_t xmask01, xmask23;
 	uint64_t lmt_status, i;
 	uint16_t pkts_left;
 
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && txq->tx_compl.ena)
+		handle_tx_completion_pkts(txq, pkts, 0);
+
 	NIX_XMIT_FC_OR_RETURN(txq, pkts);
 
 	pkts_left = pkts & (NIX_DESCS_PER_LOOP - 1);
@@ -1672,8 +1793,10 @@  cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		if ((flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) &&
 		    !(flags & NIX_TX_MULTI_SEG_F)) {
 			/* Set don't free bit if reference count > 1 */
-			xmask01 = vdupq_n_u64(0);
-			xmask23 = xmask01;
+			xmask01_w0 = vdupq_n_u64(0);
+			xmask01_w1 = vdupq_n_u64(0);
+			xmask23_w0 = xmask01_w0;
+			xmask23_w1 = xmask01_w1;
 
 			/* Move mbufs to iova */
 			mbuf0 = (uint64_t *)tx_pkts[0];
@@ -1681,35 +1804,63 @@  cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			mbuf2 = (uint64_t *)tx_pkts[2];
 			mbuf3 = (uint64_t *)tx_pkts[3];
 
-			if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf0))
-				xmask01 = vsetq_lane_u64(0x80000, xmask01, 0);
-			else
+			send_hdr.w0.u = 0;
+			send_hdr.w1.u = 0;
+
+			if (cn9k_nix_prefree_seg((struct rte_mbuf *)mbuf0, txq, &send_hdr)) {
+				send_hdr.w0.df = 1;
+				xmask01_w0 = vsetq_lane_u64(send_hdr.w0.u, xmask01_w0, 0);
+				xmask01_w1 = vsetq_lane_u64(send_hdr.w1.u, xmask01_w1, 0);
+			} else {
 				RTE_MEMPOOL_CHECK_COOKIES(
 					((struct rte_mbuf *)mbuf0)->pool,
 					(void **)&mbuf0, 1, 0);
+			}
 
-			if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf1))
-				xmask01 = vsetq_lane_u64(0x80000, xmask01, 1);
-			else
+			send_hdr.w0.u = 0;
+			send_hdr.w1.u = 0;
+
+			if (cn9k_nix_prefree_seg((struct rte_mbuf *)mbuf1, txq, &send_hdr)) {
+				send_hdr.w0.df = 1;
+				xmask01_w0 = vsetq_lane_u64(send_hdr.w0.u, xmask01_w0, 1);
+				xmask01_w1 = vsetq_lane_u64(send_hdr.w1.u, xmask01_w1, 1);
+			} else {
 				RTE_MEMPOOL_CHECK_COOKIES(
 					((struct rte_mbuf *)mbuf1)->pool,
 					(void **)&mbuf1, 1, 0);
+			}
 
-			if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf2))
-				xmask23 = vsetq_lane_u64(0x80000, xmask23, 0);
-			else
+			send_hdr.w0.u = 0;
+			send_hdr.w1.u = 0;
+
+			if (cn9k_nix_prefree_seg((struct rte_mbuf *)mbuf2, txq, &send_hdr)) {
+				send_hdr.w0.df = 1;
+				xmask23_w0 = vsetq_lane_u64(send_hdr.w0.u, xmask23_w0, 0);
+				xmask23_w1 = vsetq_lane_u64(send_hdr.w1.u, xmask23_w1, 0);
+			} else {
 				RTE_MEMPOOL_CHECK_COOKIES(
 					((struct rte_mbuf *)mbuf2)->pool,
 					(void **)&mbuf2, 1, 0);
+			}
 
-			if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf3))
-				xmask23 = vsetq_lane_u64(0x80000, xmask23, 1);
-			else
+			send_hdr.w0.u = 0;
+			send_hdr.w1.u = 0;
+
+			if (cn9k_nix_prefree_seg((struct rte_mbuf *)mbuf3, txq, &send_hdr)) {
+				send_hdr.w0.df = 1;
+				xmask23_w0 = vsetq_lane_u64(send_hdr.w0.u, xmask23_w0, 1);
+				xmask23_w1 = vsetq_lane_u64(send_hdr.w1.u, xmask23_w1, 1);
+			} else {
 				RTE_MEMPOOL_CHECK_COOKIES(
 					((struct rte_mbuf *)mbuf3)->pool,
 					(void **)&mbuf3, 1, 0);
-			senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
-			senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
+			}
+
+			senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01_w0);
+			senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23_w0);
+			senddesc01_w1 = vorrq_u64(senddesc01_w1, xmask01_w1);
+			senddesc23_w1 = vorrq_u64(senddesc23_w1, xmask23_w1);
+
 			/* Ensuring mbuf fields which got updated in
 			 * cnxk_nix_prefree_seg are written before LMTST.
 			 */
@@ -1769,7 +1920,8 @@  cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 			/* Build mseg list for each packet individually. */
 			for (j = 0; j < NIX_DESCS_PER_LOOP; j++)
-				segdw[j] = cn9k_nix_prepare_mseg_vec(tx_pkts[j],
+				segdw[j] = cn9k_nix_prepare_mseg_vec(txq,
+							tx_pkts[j],
 							seg_list[j], &cmd0[j],
 							&cmd1[j], flags);
 			segdw[4] = 8;
diff --git a/drivers/net/cnxk/cnxk_ethdev.c b/drivers/net/cnxk/cnxk_ethdev.c
index 104aad7b51..1be2e9e776 100644
--- a/drivers/net/cnxk/cnxk_ethdev.c
+++ b/drivers/net/cnxk/cnxk_ethdev.c
@@ -455,7 +455,9 @@  cnxk_nix_tx_queue_setup(struct rte_eth_dev *eth_dev, uint16_t qid,
 {
 	struct cnxk_eth_dev *dev = cnxk_eth_pmd_priv(eth_dev);
 	const struct eth_dev_ops *dev_ops = eth_dev->dev_ops;
+	struct roc_nix *nix = &dev->nix;
 	struct cnxk_eth_txq_sp *txq_sp;
+	struct roc_nix_cq *cq;
 	struct roc_nix_sq *sq;
 	size_t txq_sz;
 	int rc;
@@ -480,6 +482,19 @@  cnxk_nix_tx_queue_setup(struct rte_eth_dev *eth_dev, uint16_t qid,
 	sq->max_sqe_sz = nix_sq_max_sqe_sz(dev);
 	sq->tc = ROC_NIX_PFC_CLASS_INVALID;
 
+	if (nix->tx_compl_ena) {
+		sq->cqid = sq->qid + dev->nb_rxq;
+		sq->cq_ena = 1;
+		cq = &dev->cqs[sq->cqid];
+		cq->qid = sq->cqid;
+		cq->nb_desc = nb_desc;
+		rc = roc_nix_cq_init(&dev->nix, cq);
+		if (rc) {
+			plt_err("Failed to init cq=%d, rc=%d", cq->qid, rc);
+			return rc;
+		}
+	}
+
 	rc = roc_nix_sq_init(&dev->nix, sq);
 	if (rc) {
 		plt_err("Failed to init sq=%d, rc=%d", qid, rc);
@@ -513,7 +528,7 @@  cnxk_nix_tx_queue_setup(struct rte_eth_dev *eth_dev, uint16_t qid,
 	return 0;
 }
 
-static void
+void
 cnxk_nix_tx_queue_release(struct rte_eth_dev *eth_dev, uint16_t qid)
 {
 	void *txq = eth_dev->data->tx_queues[qid];
@@ -1234,7 +1249,7 @@  cnxk_nix_configure(struct rte_eth_dev *eth_dev)
 	if (roc_nix_is_lbk(nix))
 		nix->enable_loop = eth_dev->data->dev_conf.lpbk_mode;
 
-	nix->tx_compl_ena = 0;
+	nix->tx_compl_ena = dev->tx_compl_ena;
 
 	/* Alloc a nix lf */
 	rc = roc_nix_lf_alloc(nix, nb_rxq, nb_txq, rx_cfg);
@@ -1277,6 +1292,15 @@  cnxk_nix_configure(struct rte_eth_dev *eth_dev)
 			goto free_nix_lf;
 		}
 		dev->sqs = qs;
+
+		if (nix->tx_compl_ena) {
+			qs = plt_zmalloc(sizeof(struct roc_nix_cq) * nb_txq, 0);
+			if (!qs) {
+				plt_err("Failed to alloc cqs");
+				goto free_nix_lf;
+			}
+			dev->cqs = qs;
+		}
 	}
 
 	/* Re-enable NIX LF error interrupts */
diff --git a/drivers/net/cnxk/cnxk_ethdev.h b/drivers/net/cnxk/cnxk_ethdev.h
index a86e9dba80..4ba40e52b3 100644
--- a/drivers/net/cnxk/cnxk_ethdev.h
+++ b/drivers/net/cnxk/cnxk_ethdev.h
@@ -152,6 +152,21 @@ 
 
 #define CNXK_TX_MARK_FMT_MASK (0xFFFFFFFFFFFFull)
 
+struct cnxk_eth_txq_comp {
+	uintptr_t desc_base;
+	uintptr_t cq_door;
+	int64_t *cq_status;
+	uint64_t wdata;
+	uint32_t head;
+	uint32_t qmask;
+	uint32_t nb_desc_mask;
+	uint32_t available;
+	uint32_t sqe_id;
+	bool ena;
+	struct rte_mbuf **ptr;
+	rte_spinlock_t ext_buf_lock;
+};
+
 struct cnxk_fc_cfg {
 	enum rte_eth_fc_mode mode;
 	uint8_t rx_pause;
@@ -366,6 +381,7 @@  struct cnxk_eth_dev {
 	uint16_t flags;
 	uint8_t ptype_disable;
 	bool scalar_ena;
+	bool tx_compl_ena;
 	bool tx_mark;
 	bool ptp_en;
 	bool rx_mark_update; /* Enable/Disable mark update to mbuf */
@@ -544,6 +560,7 @@  int cnxk_nix_rx_queue_setup(struct rte_eth_dev *eth_dev, uint16_t qid,
 			    const struct rte_eth_rxconf *rx_conf,
 			    struct rte_mempool *mp);
 int cnxk_nix_tx_queue_start(struct rte_eth_dev *eth_dev, uint16_t qid);
+void cnxk_nix_tx_queue_release(struct rte_eth_dev *eth_dev, uint16_t qid);
 int cnxk_nix_tx_queue_stop(struct rte_eth_dev *eth_dev, uint16_t qid);
 int cnxk_nix_dev_start(struct rte_eth_dev *eth_dev);
 int cnxk_nix_timesync_enable(struct rte_eth_dev *eth_dev);
diff --git a/drivers/net/cnxk/cnxk_ethdev_devargs.c b/drivers/net/cnxk/cnxk_ethdev_devargs.c
index d28509dbda..dbf5bd847d 100644
--- a/drivers/net/cnxk/cnxk_ethdev_devargs.c
+++ b/drivers/net/cnxk/cnxk_ethdev_devargs.c
@@ -231,6 +231,7 @@  parse_sdp_channel_mask(const char *key, const char *value, void *extra_args)
 
 #define CNXK_RSS_RETA_SIZE	"reta_size"
 #define CNXK_SCL_ENABLE		"scalar_enable"
+#define CNXK_TX_COMPL_ENA       "tx_compl_ena"
 #define CNXK_MAX_SQB_COUNT	"max_sqb_count"
 #define CNXK_FLOW_PREALLOC_SIZE "flow_prealloc_size"
 #define CNXK_FLOW_MAX_PRIORITY	"flow_max_priority"
@@ -266,6 +267,7 @@  cnxk_ethdev_parse_devargs(struct rte_devargs *devargs, struct cnxk_eth_dev *dev)
 	struct sdp_channel sdp_chan;
 	uint16_t rss_tag_as_xor = 0;
 	uint16_t scalar_enable = 0;
+	uint16_t tx_compl_ena = 0;
 	uint16_t custom_sa_act = 0;
 	struct rte_kvargs *kvlist;
 	uint16_t no_inl_dev = 0;
@@ -285,6 +287,8 @@  cnxk_ethdev_parse_devargs(struct rte_devargs *devargs, struct cnxk_eth_dev *dev)
 			   &reta_sz);
 	rte_kvargs_process(kvlist, CNXK_SCL_ENABLE, &parse_flag,
 			   &scalar_enable);
+	rte_kvargs_process(kvlist, CNXK_TX_COMPL_ENA, &parse_flag,
+			   &tx_compl_ena);
 	rte_kvargs_process(kvlist, CNXK_MAX_SQB_COUNT, &parse_sqb_count,
 			   &sqb_count);
 	rte_kvargs_process(kvlist, CNXK_FLOW_PREALLOC_SIZE,
@@ -319,6 +323,7 @@  cnxk_ethdev_parse_devargs(struct rte_devargs *devargs, struct cnxk_eth_dev *dev)
 
 null_devargs:
 	dev->scalar_ena = !!scalar_enable;
+	dev->tx_compl_ena = !!tx_compl_ena;
 	dev->inb.no_inl_dev = !!no_inl_dev;
 	dev->inb.min_spi = ipsec_in_min_spi;
 	dev->inb.max_spi = ipsec_in_max_spi;
@@ -349,6 +354,7 @@  cnxk_ethdev_parse_devargs(struct rte_devargs *devargs, struct cnxk_eth_dev *dev)
 RTE_PMD_REGISTER_PARAM_STRING(net_cnxk,
 			      CNXK_RSS_RETA_SIZE "=<64|128|256>"
 			      CNXK_SCL_ENABLE "=1"
+			      CNXK_TX_COMPL_ENA "=1"
 			      CNXK_MAX_SQB_COUNT "=<8-512>"
 			      CNXK_FLOW_PREALLOC_SIZE "=<1-32>"
 			      CNXK_FLOW_MAX_PRIORITY "=<1-32>"