[v2,8/9] crypto/mlx5: add enqueue and dequeue operations

Message ID 20230526031422.913377-9-suanmingm@nvidia.com (mailing list archive)
State Superseded, archived
Delegated to: akhil goyal
Series: crypto/mlx5: support AES-GCM

Checks

Context        Check     Description
ci/checkpatch  warning   coding style issues

Commit Message

Suanming Mou May 26, 2023, 3:14 a.m. UTC
  The crypto operations are performed with crypto WQEs. If the input
buffers (AAD, mbuf, digest) are not contiguous and there is not enough
headroom/tailroom for copying the AAD/digest, as required by the FW, a
UMR WQE is needed to present a contiguous address space to the crypto
WQE. The UMR WQEs and crypto WQEs are handled in two different QPs.
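
A rough sketch of that contiguity decision (simplified from
mlx5_crypto_gcm_get_op_info() in the patch below; it leaves out the
fast path where AAD/digest already sit at the expected addresses and
the headroom/tailroom copy case, and op_needs_umr() is an illustrative
name, not a driver function):

    #include <stdbool.h>
    #include <rte_crypto.h>
    #include <rte_mbuf.h>

    /* Illustrative only: the real decision is in mlx5_crypto_gcm_get_op_info(). */
    static bool
    op_needs_umr(struct rte_crypto_op *op, uint32_t aad_len, uint32_t tag_len)
    {
        struct rte_mbuf *m_src = op->sym->m_src;
        struct rte_mbuf *m_dst = op->sym->m_dst ? op->sym->m_dst : m_src;

        /* Chained mbufs can never be presented as one contiguous block. */
        if (m_src->nb_segs > 1 || m_dst->nb_segs > 1)
            return true;
        /* AAD must fit into the headroom right in front of the payload. */
        if (aad_len > rte_pktmbuf_headroom(m_src) ||
            op->sym->aead.data.offset != 0)
            return true;
        /* Digest must fit into the tailroom right after the payload. */
        if (rte_pktmbuf_tailroom(m_dst) < tag_len)
            return true;
        return false;
    }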

A crypto operation with non-contiguous buffers gets its own UMR WQE,
while an operation with contiguous buffers does not need one. Once all
the operations' WQEs of the enqueue burst have been built, and if any
UMR WQEs were built, an additional SEND_EN WQE is appended as the
final WQE of the burst in the UMR QP. The purpose of that SEND_EN WQE
is to trigger the crypto QP processing only once the UMR WQEs have
made the input memory address space buffers ready.
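
In pseudo-C, the end-of-burst handling then reads roughly as follows
(a simplified sketch of the tail of mlx5_crypto_gcm_enqueue_burst() in
this patch; ring_doorbell() and append_send_en() are placeholder
names, not driver APIs):

    /* Simplified sketch; placeholder helpers, not real driver calls. */
    if (umr_cnt == 0 && !qp->has_umr) {
        /* Purely contiguous burst: start the crypto QP directly. */
        ring_doorbell(crypto_qp);
    } else {
        /*
         * Some ops needed a UMR: close the burst with a SEND_EN WQE on
         * the UMR QP and ring only that QP. The SEND_EN WQE enables the
         * crypto QP up to the current producer index, so the crypto
         * WQEs run only after their UMR mkeys are ready.
         */
        append_send_en(umr_qp, crypto_qp_pi);
        ring_doorbell(umr_qp);
    }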

The QP for crypto operations contains only crypto WQEs, and its WQE
slots are pre-built as fixed-size entries at QP setup. The QP
processing is triggered either by a doorbell ring or by the SEND_EN
WQE from the UMR QP.
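
Since the slots are fixed, enqueue only refreshes the per-op fields of
the WQE selected by the producer index, roughly as below (following
mlx5_crypto_gcm_wqe_set() in this patch, with only the data-path
fields shown):

    uint32_t idx = qp->pi & (qp->entries_n - 1);
    struct mlx5_gga_wqe *wqe = &((struct mlx5_gga_wqe *)qp->qp_obj.wqes)[idx];

    /* Gather segment: contiguous input (mbuf lkey or UMR mkey). */
    wqe->gather.pbuf  = rte_cpu_to_be_64((uintptr_t)data->src_addr);
    wqe->gather.lkey  = data->src_mkey;
    /* Scatter segment: contiguous output. */
    wqe->scatter.pbuf = rte_cpu_to_be_64((uintptr_t)data->dst_addr);
    wqe->scatter.lkey = data->dst_mkey;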

Signed-off-by: Suanming Mou <suanmingm@nvidia.com>
---
 drivers/common/mlx5/mlx5_prm.h        |   1 +
 drivers/crypto/mlx5/mlx5_crypto.c     |   9 +-
 drivers/crypto/mlx5/mlx5_crypto.h     |   8 +
 drivers/crypto/mlx5/mlx5_crypto_gcm.c | 588 ++++++++++++++++++++++++++
 4 files changed, 604 insertions(+), 2 deletions(-)
  

Patch

diff --git a/drivers/common/mlx5/mlx5_prm.h b/drivers/common/mlx5/mlx5_prm.h
index a502e29bd8..98b71a4031 100644
--- a/drivers/common/mlx5/mlx5_prm.h
+++ b/drivers/common/mlx5/mlx5_prm.h
@@ -617,6 +617,7 @@  struct mlx5_wqe_send_en_wqe {
 /* MMO metadata segment */
 
 #define	MLX5_OPCODE_MMO	0x2fu
+#define	MLX5_OPC_MOD_MMO_CRYPTO 0x6u
 #define	MLX5_OPC_MOD_MMO_REGEX 0x4u
 #define	MLX5_OPC_MOD_MMO_COMP 0x2u
 #define	MLX5_OPC_MOD_MMO_DECOMP 0x3u
diff --git a/drivers/crypto/mlx5/mlx5_crypto.c b/drivers/crypto/mlx5/mlx5_crypto.c
index ff632cd69a..4d7d3ef2a3 100644
--- a/drivers/crypto/mlx5/mlx5_crypto.c
+++ b/drivers/crypto/mlx5/mlx5_crypto.c
@@ -62,8 +62,13 @@  mlx5_crypto_dev_infos_get(struct rte_cryptodev *dev,
 			MLX5_CRYPTO_FEATURE_FLAGS(priv->is_wrapped_mode);
 		dev_info->capabilities = priv->caps;
 		dev_info->max_nb_queue_pairs = MLX5_CRYPTO_MAX_QPS;
-		dev_info->min_mbuf_headroom_req = 0;
-		dev_info->min_mbuf_tailroom_req = 0;
+		if (priv->caps->sym.xform_type == RTE_CRYPTO_SYM_XFORM_AEAD) {
+			dev_info->min_mbuf_headroom_req = MLX5_CRYPTO_GCM_MAX_AAD;
+			dev_info->min_mbuf_tailroom_req = MLX5_CRYPTO_GCM_MAX_DIGEST;
+		} else {
+			dev_info->min_mbuf_headroom_req = 0;
+			dev_info->min_mbuf_tailroom_req = 0;
+		}
 		dev_info->sym.max_nb_sessions = 0;
 		/*
 		 * If 0, the device does not have any limitation in number of
diff --git a/drivers/crypto/mlx5/mlx5_crypto.h b/drivers/crypto/mlx5/mlx5_crypto.h
index 88a09a6b1c..6dcb41b27c 100644
--- a/drivers/crypto/mlx5/mlx5_crypto.h
+++ b/drivers/crypto/mlx5/mlx5_crypto.h
@@ -23,6 +23,8 @@ 
 #define MLX5_CRYPTO_KLM_SEGS_NUM(umr_wqe_sz) ((umr_wqe_sz -\
 					MLX5_CRYPTO_UMR_WQE_STATIC_SIZE) /\
 					MLX5_WSEG_SIZE)
+#define MLX5_CRYPTO_GCM_MAX_AAD 64
+#define MLX5_CRYPTO_GCM_MAX_DIGEST 16
 
 struct mlx5_crypto_priv {
 	TAILQ_ENTRY(mlx5_crypto_priv) next;
@@ -61,6 +63,9 @@  struct mlx5_crypto_qp {
 	uint8_t *wqe;
 	uint16_t entries_n;
 	uint16_t cq_entries_n;
+	uint16_t reported_ci;
+	uint16_t qp_ci;
+	uint16_t cq_ci;
 	uint16_t pi;
 	uint16_t ci;
 	uint16_t db_pi;
@@ -70,6 +75,9 @@  struct mlx5_crypto_qp {
 	uint16_t umr_pi;
 	uint16_t umr_ci;
 	uint32_t umr_errors;
+	uint16_t last_gga_pi;
+	bool has_umr;
+	uint16_t cpy_tag_op;
 };
 
 struct mlx5_crypto_dek {
diff --git a/drivers/crypto/mlx5/mlx5_crypto_gcm.c b/drivers/crypto/mlx5/mlx5_crypto_gcm.c
index dfef5455b4..2231bcbe6f 100644
--- a/drivers/crypto/mlx5/mlx5_crypto_gcm.c
+++ b/drivers/crypto/mlx5/mlx5_crypto_gcm.c
@@ -9,6 +9,7 @@ 
 #include <rte_log.h>
 #include <bus_pci_driver.h>
 #include <rte_memory.h>
+#include <rte_io.h>
 
 #include <mlx5_glue.h>
 #include <mlx5_common.h>
@@ -32,6 +33,40 @@ 
 	 RTE_ALIGN(sizeof(struct mlx5_wqe_send_en_wqe), \
 	 MLX5_SEND_WQE_BB))
 
+#define MLX5_UMR_GCM_WQE_STRIDE \
+	(MLX5_UMR_GCM_WQE_SIZE / MLX5_SEND_WQE_BB)
+
+#define MLX5_MMO_CRYPTO_OPC (MLX5_OPCODE_MMO | \
+	(MLX5_OPC_MOD_MMO_CRYPTO << WQE_CSEG_OPC_MOD_OFFSET))
+
+/*
+ * The default op status is RTE_CRYPTO_OP_STATUS_SUCCESS.
+ * An op that needs a tag copy is marked with a different status value.
+ */
+#define MLX5_CRYPTO_OP_STATUS_GCM_TAG_COPY (RTE_CRYPTO_OP_STATUS_SUCCESS + 1)
+
+struct mlx5_crypto_gcm_op_info {
+	bool need_umr;
+	bool is_oop;
+	bool is_enc;
+	void *digest;
+	void *src_addr;
+};
+
+struct mlx5_crypto_gcm_data {
+	void *src_addr;
+	uint32_t src_bytes;
+	void *dst_addr;
+	uint32_t dst_bytes;
+	uint32_t src_mkey;
+	uint32_t dst_mkey;
+};
+
+struct mlx5_crypto_gcm_tag_cpy_info {
+	void *digest;
+	uint8_t tag_len;
+} __rte_packed;
+
 static struct rte_cryptodev_capabilities mlx5_crypto_gcm_caps[] = {
 	{
 		.op = RTE_CRYPTO_OP_TYPE_UNDEFINED,
@@ -326,6 +361,557 @@  mlx5_crypto_gcm_qp_setup(struct rte_cryptodev *dev, uint16_t qp_id,
 	return -1;
 }
 
+static __rte_always_inline void
+mlx5_crypto_gcm_get_op_info(struct mlx5_crypto_qp *qp,
+			    struct rte_crypto_op *op,
+			    struct mlx5_crypto_gcm_op_info *op_info)
+{
+	struct mlx5_crypto_session *sess = CRYPTODEV_GET_SYM_SESS_PRIV(op->sym->session);
+	struct rte_mbuf *m_src = op->sym->m_src;
+	void *aad_addr = op->sym->aead.aad.data;
+	void *tag_addr = op->sym->aead.digest.data;
+	void *src_addr = rte_pktmbuf_mtod_offset(m_src, void *, op->sym->aead.data.offset);
+	struct rte_mbuf *m_dst = m_src;
+	void *dst_addr = src_addr;
+	void *expected_aad = NULL;
+	void *expected_tag = NULL;
+	bool is_enc = sess->op_type == MLX5_CRYPTO_OP_TYPE_ENCRYPTION;
+	bool cp_aad = false;
+	bool cp_tag = false;
+
+	op_info->is_oop = false;
+	op_info->need_umr = false;
+	op_info->is_enc = is_enc;
+	op_info->digest = NULL;
+	op_info->src_addr = aad_addr;
+	if (op->sym->m_dst && op->sym->m_dst != m_src) {
+		op_info->is_oop = true;
+		m_dst = op->sym->m_dst;
+		dst_addr = rte_pktmbuf_mtod_offset(m_dst, void *, op->sym->aead.data.offset);
+		if (m_dst->nb_segs > 1) {
+			op_info->need_umr = true;
+			return;
+		}
+		/*
+		 * If the op's mbuf has extra data offset, don't copy AAD to
+		 * this area.
+		 */
+		if (rte_pktmbuf_headroom(m_dst) < sess->aad_len ||
+		    op->sym->aead.data.offset) {
+			op_info->need_umr = true;
+			return;
+		}
+	}
+	if (m_src->nb_segs > 1) {
+		op_info->need_umr = true;
+		return;
+	}
+	expected_aad = RTE_PTR_SUB(src_addr, sess->aad_len);
+	if (expected_aad != aad_addr) {
+		/*
+		 * If the op's mbuf has extra data offset, don't copy AAD to
+		 * this area.
+		 */
+		if (sess->aad_len > MLX5_CRYPTO_GCM_MAX_AAD ||
+		    sess->aad_len > rte_pktmbuf_headroom(m_src) ||
+		    op->sym->aead.data.offset) {
+			op_info->need_umr = true;
+			return;
+		}
+		cp_aad = true;
+		op_info->src_addr = expected_aad;
+	}
+	expected_tag = RTE_PTR_ADD(is_enc ? dst_addr : src_addr, op->sym->aead.data.length);
+	if (expected_tag != tag_addr) {
+		struct rte_mbuf *mbuf = is_enc ? m_dst : m_src;
+
+		/*
+		 * If the op's mbuf is not fully used as payload, don't copy the
+		 * digest into the remaining area.
+		 */
+		if (rte_pktmbuf_tailroom(mbuf) < sess->tag_len ||
+		    rte_pktmbuf_data_len(mbuf) != op->sym->aead.data.length) {
+			op_info->need_umr = true;
+			return;
+		}
+		if (is_enc) {
+			op_info->digest = expected_tag;
+			qp->cpy_tag_op++;
+		} else {
+			cp_tag = true;
+		}
+	}
+	if (cp_aad)
+		memcpy(expected_aad, aad_addr, sess->aad_len);
+	if (cp_tag)
+		memcpy(expected_tag, tag_addr, sess->tag_len);
+}
+
+static __rte_always_inline uint32_t
+_mlx5_crypto_gcm_umr_build_mbuf_klm(struct mlx5_crypto_qp *qp,
+				    struct rte_mbuf *mbuf,
+				    struct mlx5_klm *klm,
+				    uint32_t offset,
+				    uint32_t *remain)
+{
+	uint32_t data_len = (rte_pktmbuf_data_len(mbuf) - offset);
+	uintptr_t addr = rte_pktmbuf_mtod_offset(mbuf, uintptr_t, offset);
+
+	if (data_len > *remain)
+		data_len = *remain;
+	*remain -= data_len;
+	klm->byte_count = rte_cpu_to_be_32(data_len);
+	klm->address = rte_cpu_to_be_64(addr);
+	klm->mkey = mlx5_mr_mb2mr(&qp->mr_ctrl, mbuf);
+	return klm->mkey;
+}
+
+static __rte_always_inline int
+mlx5_crypto_gcm_build_mbuf_chain_klms(struct mlx5_crypto_qp *qp,
+				      struct rte_crypto_op *op,
+				      struct rte_mbuf *mbuf,
+				      struct mlx5_klm *klm)
+{
+	uint32_t remain_len = op->sym->aead.data.length;
+	__rte_unused uint32_t nb_segs = mbuf->nb_segs;
+	uint32_t klm_n = 0;
+
+	/* mbuf seg num must not exceed max_segs_num. */
+	MLX5_ASSERT(nb_segs <= qp->priv->max_segs_num);
+	/* First mbuf needs to take the data offset. */
+	if (unlikely(_mlx5_crypto_gcm_umr_build_mbuf_klm(qp, mbuf, klm,
+		     op->sym->aead.data.offset, &remain_len) == UINT32_MAX)) {
+		op->status = RTE_CRYPTO_OP_STATUS_ERROR;
+		return 0;
+	}
+	klm++;
+	klm_n++;
+	while (remain_len) {
+		nb_segs--;
+		mbuf = mbuf->next;
+		MLX5_ASSERT(mbuf && nb_segs);
+		if (unlikely(_mlx5_crypto_gcm_umr_build_mbuf_klm(qp, mbuf, klm,
+						0, &remain_len) == UINT32_MAX)) {
+			op->status = RTE_CRYPTO_OP_STATUS_ERROR;
+			return 0;
+		}
+		klm++;
+		klm_n++;
+	}
+	return klm_n;
+}
+
+static __rte_always_inline int
+mlx5_crypto_gcm_build_klm_by_addr(struct mlx5_crypto_qp *qp,
+				  struct mlx5_klm *klm,
+				  void *addr,
+				  uint32_t len)
+{
+	klm->byte_count = rte_cpu_to_be_32(len);
+	klm->address = rte_cpu_to_be_64((uintptr_t)addr);
+	klm->mkey = mlx5_mr_addr2mr_bh(&qp->mr_ctrl, (uintptr_t)addr);
+	if (klm->mkey == UINT32_MAX)
+		return 0;
+	return 1;
+}
+
+static __rte_always_inline int
+mlx5_crypto_gcm_build_op_klm(struct mlx5_crypto_qp *qp,
+			     struct rte_crypto_op *op,
+			     struct mlx5_crypto_gcm_op_info *op_info,
+			     struct mlx5_klm *klm,
+			     uint32_t *len)
+{
+	struct mlx5_crypto_session *sess = CRYPTODEV_GET_SYM_SESS_PRIV(op->sym->session);
+	struct mlx5_klm *digest = NULL, *aad = NULL;
+	uint32_t total_len = op->sym->aead.data.length + sess->aad_len + sess->tag_len;
+	uint32_t klm_n = 0, klm_src = 0, klm_dst = 0;
+
+	/* Build AAD KLM. */
+	aad = klm;
+	if (!mlx5_crypto_gcm_build_klm_by_addr(qp, aad, op->sym->aead.aad.data, sess->aad_len))
+		return 0;
+	klm_n++;
+	/* Build src mbuf KLM. */
+	klm_src = mlx5_crypto_gcm_build_mbuf_chain_klms(qp, op, op->sym->m_src, &klm[klm_n]);
+	if (!klm_src)
+		return 0;
+	klm_n += klm_src;
+	/* Reserve digest KLM if needed. */
+	if (!op_info->is_oop ||
+	    sess->op_type == MLX5_CRYPTO_OP_TYPE_DECRYPTION) {
+		digest = &klm[klm_n];
+		klm_n++;
+	}
+	/* Build dst mbuf KLM. */
+	if (op_info->is_oop) {
+		klm[klm_n] = *aad;
+		klm_n++;
+		klm_dst = mlx5_crypto_gcm_build_mbuf_chain_klms(qp, op, op->sym->m_dst,
+								&klm[klm_n]);
+		if (!klm_dst)
+			return 0;
+		klm_n += klm_dst;
+		total_len += (op->sym->aead.data.length + sess->aad_len);
+	}
+	/* Update digest at the end if it is not set. */
+	if (!digest) {
+		digest = &klm[klm_n];
+		klm_n++;
+	}
+	/* Build digest KLM. */
+	if (!mlx5_crypto_gcm_build_klm_by_addr(qp, digest, op->sym->aead.digest.data,
+					       sess->tag_len))
+		return 0;
+	*len = total_len;
+	return klm_n;
+}
+
+static __rte_always_inline struct mlx5_wqe_cseg *
+mlx5_crypto_gcm_get_umr_wqe(struct mlx5_crypto_qp *qp)
+{
+	uint32_t wqe_offset = qp->umr_pi & (qp->umr_wqbbs - 1);
+	uint32_t left_wqbbs = qp->umr_wqbbs - wqe_offset;
+	struct mlx5_wqe_cseg *wqe;
+
+	/* If the UMR WQE would wrap past the SQ end. */
+	if (left_wqbbs < MLX5_UMR_GCM_WQE_STRIDE) {
+		/* Append a NOP WQE as the remaining WQEBBs are not enough for UMR. */
+		wqe = RTE_PTR_ADD(qp->umr_qp_obj.umem_buf, wqe_offset * MLX5_SEND_WQE_BB);
+		wqe->opcode = rte_cpu_to_be_32(MLX5_OPCODE_NOP | ((uint32_t)qp->umr_pi << 8));
+		wqe->sq_ds = rte_cpu_to_be_32((qp->umr_qp_obj.qp->id << 8) | (left_wqbbs << 2));
+		wqe->flags = RTE_BE32(0);
+		wqe->misc = RTE_BE32(0);
+		qp->umr_pi += left_wqbbs;
+		wqe_offset = qp->umr_pi & (qp->umr_wqbbs - 1);
+	}
+	wqe_offset *= MLX5_SEND_WQE_BB;
+	return RTE_PTR_ADD(qp->umr_qp_obj.umem_buf, wqe_offset);
+}
+
+static __rte_always_inline int
+mlx5_crypto_gcm_build_umr(struct mlx5_crypto_qp *qp,
+			  struct rte_crypto_op *op,
+			  uint32_t idx,
+			  struct mlx5_crypto_gcm_op_info *op_info,
+			  struct mlx5_crypto_gcm_data *data)
+{
+	struct mlx5_crypto_priv *priv = qp->priv;
+	struct mlx5_crypto_session *sess = CRYPTODEV_GET_SYM_SESS_PRIV(op->sym->session);
+	struct mlx5_wqe_cseg *wqe;
+	struct mlx5_wqe_umr_cseg *ucseg;
+	struct mlx5_wqe_mkey_cseg *mkc;
+	struct mlx5_klm *iklm;
+	struct mlx5_klm *klm = &qp->klm_array[idx * priv->max_klm_num];
+	uint16_t klm_size, klm_align;
+	uint32_t total_len;
+
+	/* Build KLM base on the op. */
+	klm_size = mlx5_crypto_gcm_build_op_klm(qp, op, op_info, klm, &total_len);
+	if (!klm_size)
+		return -EINVAL;
+	klm_align = RTE_ALIGN(klm_size, 4);
+	/* Get UMR WQE memory. */
+	wqe = mlx5_crypto_gcm_get_umr_wqe(qp);
+	memset(wqe, 0, MLX5_UMR_GCM_WQE_SIZE);
+	/* Set WQE control seg. Non-inline KLM UMR WQE size must be 9 WQE_DS. */
+	wqe->opcode = rte_cpu_to_be_32(MLX5_OPCODE_UMR | ((uint32_t)qp->umr_pi << 8));
+	wqe->sq_ds = rte_cpu_to_be_32((qp->umr_qp_obj.qp->id << 8) | 9);
+	wqe->flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR << MLX5_COMP_MODE_OFFSET);
+	wqe->misc = rte_cpu_to_be_32(qp->mkey[idx]->id);
+	/* Set UMR WQE control seg. */
+	ucseg = (struct mlx5_wqe_umr_cseg *)(wqe + 1);
+	ucseg->mkey_mask |= RTE_BE64(1u << 0);
+	ucseg->ko_to_bs = rte_cpu_to_be_32(klm_align << MLX5_UMRC_KO_OFFSET);
+	/* Set mkey context seg. */
+	mkc = (struct mlx5_wqe_mkey_cseg *)(ucseg + 1);
+	mkc->len = rte_cpu_to_be_64(total_len);
+	mkc->qpn_mkey = rte_cpu_to_be_32(0xffffff00 | (qp->mkey[idx]->id & 0xff));
+	/* Set UMR pointer to data seg. */
+	iklm = (struct mlx5_klm *)(mkc + 1);
+	iklm->address = rte_cpu_to_be_64((uintptr_t)((char *)klm));
+	iklm->mkey = rte_cpu_to_be_32(qp->mr.lkey);
+	data->src_mkey = rte_cpu_to_be_32(qp->mkey[idx]->id);
+	data->dst_mkey = data->src_mkey;
+	data->src_addr = 0;
+	data->src_bytes = sess->aad_len + op->sym->aead.data.length;
+	data->dst_bytes = data->src_bytes;
+	if (op_info->is_enc)
+		data->dst_bytes += sess->tag_len;
+	else
+		data->src_bytes += sess->tag_len;
+	if (op_info->is_oop)
+		data->dst_addr = (void *)(uintptr_t)(data->src_bytes);
+	else
+		data->dst_addr = 0;
+	/* Clear the padding memory. */
+	memset(&klm[klm_size], 0, sizeof(struct mlx5_klm) * (klm_align - klm_size));
+	/* Update PI and WQE */
+	qp->umr_pi += MLX5_UMR_GCM_WQE_STRIDE;
+	qp->umr_wqe = (uint8_t *)wqe;
+	return 0;
+}
+
+static __rte_always_inline void
+mlx5_crypto_gcm_build_send_en(struct mlx5_crypto_qp *qp)
+{
+	uint32_t wqe_offset = (qp->umr_pi & (qp->umr_wqbbs - 1)) * MLX5_SEND_WQE_BB;
+	struct mlx5_wqe_cseg *cs = RTE_PTR_ADD(qp->umr_qp_obj.wqes, wqe_offset);
+	struct mlx5_wqe_qseg *qs = RTE_PTR_ADD(cs, sizeof(struct mlx5_wqe_cseg));
+
+	cs->opcode = rte_cpu_to_be_32(MLX5_OPCODE_SEND_EN | ((uint32_t)qp->umr_pi << 8));
+	cs->sq_ds = rte_cpu_to_be_32((qp->umr_qp_obj.qp->id << 8) | 2);
+	/*
+	 * No need to generate a SEND_EN CQE as normally only GGA CQEs are
+	 * wanted in the CQ. qp->last_gga_pi can be compared with the crypto
+	 * QP CI to know whether all WQEs enabled by SEND_EN are consumed.
+	 */
+	cs->flags = RTE_BE32((MLX5_COMP_ONLY_FIRST_ERR << MLX5_COMP_MODE_OFFSET) |
+			MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE);
+	cs->misc = RTE_BE32(0);
+	qs->max_index = rte_cpu_to_be_32(qp->pi);
+	qs->qpn_cqn = rte_cpu_to_be_32(qp->qp_obj.qp->id);
+	qp->umr_wqe = (uint8_t *)cs;
+	qp->umr_pi += 1;
+}
+
+static __rte_always_inline void
+mlx5_crypto_gcm_wqe_set(struct mlx5_crypto_qp *qp,
+			struct rte_crypto_op *op,
+			uint32_t idx,
+			struct mlx5_crypto_gcm_data *data)
+{
+	struct mlx5_crypto_session *sess = CRYPTODEV_GET_SYM_SESS_PRIV(op->sym->session);
+	struct mlx5_gga_wqe *wqe = &((struct mlx5_gga_wqe *)qp->qp_obj.wqes)[idx];
+	union mlx5_gga_crypto_opaque *opaq = qp->opaque_addr;
+
+	memcpy(opaq[idx].cp.iv,
+		rte_crypto_op_ctod_offset(op, uint8_t *, sess->iv_offset), sess->iv_len);
+	opaq[idx].cp.tag_size = sess->wqe_tag_len;
+	opaq[idx].cp.aad_size = sess->wqe_aad_len;
+	/* Update control seg. */
+	wqe->opcode = rte_cpu_to_be_32(MLX5_MMO_CRYPTO_OPC + (qp->pi << 8));
+	wqe->gga_ctrl1 = sess->mmo_ctrl;
+	wqe->gga_ctrl2 = sess->dek_id;
+	wqe->flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR << MLX5_COMP_MODE_OFFSET);
+	/* Update op_info seg. */
+	wqe->gather.bcount = rte_cpu_to_be_32(data->src_bytes);
+	wqe->gather.lkey = data->src_mkey;
+	wqe->gather.pbuf = rte_cpu_to_be_64((uintptr_t)data->src_addr);
+	/* Update output seg. */
+	wqe->scatter.bcount = rte_cpu_to_be_32(data->dst_bytes);
+	wqe->scatter.lkey = data->dst_mkey;
+	wqe->scatter.pbuf = rte_cpu_to_be_64((uintptr_t)data->dst_addr);
+	qp->wqe = (uint8_t *)wqe;
+}
+
+static uint16_t
+mlx5_crypto_gcm_enqueue_burst(void *queue_pair,
+			      struct rte_crypto_op **ops,
+			      uint16_t nb_ops)
+{
+	struct mlx5_crypto_qp *qp = queue_pair;
+	struct mlx5_crypto_session *sess;
+	struct mlx5_crypto_priv *priv = qp->priv;
+	struct mlx5_crypto_gcm_tag_cpy_info *tag;
+	struct mlx5_crypto_gcm_data gcm_data;
+	struct rte_crypto_op *op;
+	struct mlx5_crypto_gcm_op_info op_info;
+	uint16_t mask = qp->entries_n - 1;
+	uint16_t remain = qp->entries_n - (qp->pi - qp->qp_ci);
+	uint32_t idx;
+	uint16_t umr_cnt = 0;
+
+	if (remain < nb_ops)
+		nb_ops = remain;
+	else
+		remain = nb_ops;
+	if (unlikely(remain == 0))
+		return 0;
+	do {
+		op = *ops++;
+		sess = CRYPTODEV_GET_SYM_SESS_PRIV(op->sym->session);
+		idx = qp->pi & mask;
+		mlx5_crypto_gcm_get_op_info(qp, op, &op_info);
+		if (!op_info.need_umr) {
+			gcm_data.src_addr = op_info.src_addr;
+			gcm_data.src_bytes = op->sym->aead.data.length + sess->aad_len;
+			gcm_data.src_mkey = mlx5_mr_mb2mr(&qp->mr_ctrl, op->sym->m_src);
+			if (op_info.is_oop) {
+				gcm_data.dst_addr = RTE_PTR_SUB
+					(rte_pktmbuf_mtod_offset(op->sym->m_dst,
+					 void *, op->sym->aead.data.offset), sess->aad_len);
+				gcm_data.dst_mkey = mlx5_mr_mb2mr(&qp->mr_ctrl, op->sym->m_dst);
+			} else {
+				gcm_data.dst_addr = gcm_data.src_addr;
+				gcm_data.dst_mkey = gcm_data.src_mkey;
+			}
+			gcm_data.dst_bytes = gcm_data.src_bytes;
+			if (op_info.is_enc)
+				gcm_data.dst_bytes += sess->tag_len;
+			else
+				gcm_data.src_bytes += sess->tag_len;
+		} else {
+			if (unlikely(mlx5_crypto_gcm_build_umr(qp, op, idx,
+							&op_info, &gcm_data))) {
+				qp->stats.enqueue_err_count++;
+				if (remain != nb_ops) {
+					qp->stats.enqueued_count -= remain;
+					break;
+				}
+				return 0;
+			}
+			umr_cnt++;
+		}
+		mlx5_crypto_gcm_wqe_set(qp, op, idx, &gcm_data);
+		if (op_info.digest) {
+			tag = (struct mlx5_crypto_gcm_tag_cpy_info *)op->sym->aead.digest.data;
+			tag->digest = op_info.digest;
+			tag->tag_len = sess->tag_len;
+			op->status = MLX5_CRYPTO_OP_STATUS_GCM_TAG_COPY;
+		} else {
+			op->status = RTE_CRYPTO_OP_STATUS_SUCCESS;
+		}
+		qp->ops[idx] = op;
+		qp->pi++;
+	} while (--remain);
+	qp->stats.enqueued_count += nb_ops;
+	/* Update the last GGA cseg with COMP. */
+	((struct mlx5_wqe_cseg *)qp->wqe)->flags =
+		RTE_BE32(MLX5_COMP_ALWAYS << MLX5_COMP_MODE_OFFSET);
+	/* Only when there are no pending SEND_EN WQEs in background. */
+	if (!umr_cnt && !qp->has_umr) {
+		mlx5_doorbell_ring(&priv->uar.bf_db, *(volatile uint64_t *)qp->wqe,
+				   qp->pi, &qp->qp_obj.db_rec[MLX5_SND_DBR],
+				   !priv->uar.dbnc);
+	} else {
+		mlx5_crypto_gcm_build_send_en(qp);
+		mlx5_doorbell_ring(&priv->uar.bf_db, *(volatile uint64_t *)qp->umr_wqe,
+				   qp->umr_pi, &qp->umr_qp_obj.db_rec[MLX5_SND_DBR],
+				   !priv->uar.dbnc);
+		qp->last_gga_pi = qp->pi;
+		qp->has_umr = true;
+	}
+	return nb_ops;
+}
+
+static __rte_noinline void
+mlx5_crypto_gcm_cqe_err_handle(struct mlx5_crypto_qp *qp, struct rte_crypto_op *op)
+{
+	uint8_t op_code;
+	const uint32_t idx = qp->cq_ci & (qp->entries_n - 1);
+	volatile struct mlx5_err_cqe *cqe = (volatile struct mlx5_err_cqe *)
+							&qp->cq_obj.cqes[idx];
+
+	op_code = rte_be_to_cpu_32(cqe->s_wqe_opcode_qpn) >> MLX5_CQ_INDEX_WIDTH;
+	DRV_LOG(ERR, "CQE ERR:0x%x, Vendor_ERR:0x%x, OP:0x%x, QPN:0x%x, WQE_CNT:0x%x",
+		cqe->syndrome, cqe->vendor_err_synd, op_code,
+		(rte_be_to_cpu_32(cqe->s_wqe_opcode_qpn) & 0xffffff),
+		rte_be_to_cpu_16(cqe->wqe_counter));
+	if (op && op_code == MLX5_OPCODE_MMO) {
+		op->status = RTE_CRYPTO_OP_STATUS_ERROR;
+		qp->stats.dequeue_err_count++;
+	}
+}
+
+static __rte_always_inline void
+mlx5_crypto_gcm_fill_op(struct mlx5_crypto_qp *qp,
+			struct rte_crypto_op **ops,
+			uint16_t orci,
+			uint16_t rci,
+			uint16_t op_mask)
+{
+	uint16_t n;
+
+	orci &= op_mask;
+	rci &= op_mask;
+	if (unlikely(orci > rci)) {
+		n = op_mask - orci + 1;
+		memcpy(ops, &qp->ops[orci], n * sizeof(*ops));
+		orci = 0;
+	} else {
+		n = 0;
+	}
+	/* rci can be 0 here, memcpy will skip that. */
+	memcpy(&ops[n], &qp->ops[orci], (rci - orci) * sizeof(*ops));
+}
+
+static __rte_always_inline void
+mlx5_crypto_gcm_cpy_tag(struct mlx5_crypto_qp *qp,
+			uint16_t orci,
+			uint16_t rci,
+			uint16_t op_mask)
+{
+	struct rte_crypto_op *op;
+	struct mlx5_crypto_gcm_tag_cpy_info *tag;
+
+	while (qp->cpy_tag_op && orci != rci) {
+		op = qp->ops[orci & op_mask];
+		if (op->status == MLX5_CRYPTO_OP_STATUS_GCM_TAG_COPY) {
+			tag = (struct mlx5_crypto_gcm_tag_cpy_info *)op->sym->aead.digest.data;
+			memcpy(op->sym->aead.digest.data, tag->digest, tag->tag_len);
+			op->status = RTE_CRYPTO_OP_STATUS_SUCCESS;
+			qp->cpy_tag_op--;
+		}
+		orci++;
+	}
+}
+
+static uint16_t
+mlx5_crypto_gcm_dequeue_burst(void *queue_pair,
+			      struct rte_crypto_op **ops,
+			      uint16_t nb_ops)
+{
+	struct mlx5_crypto_qp *qp = queue_pair;
+	volatile struct mlx5_cqe *restrict cqe;
+	const unsigned int cq_size = qp->cq_entries_n;
+	const unsigned int mask = cq_size - 1;
+	const unsigned int op_mask = qp->entries_n - 1;
+	uint32_t idx;
+	uint32_t next_idx = qp->cq_ci & mask;
+	uint16_t reported_ci = qp->reported_ci;
+	uint16_t qp_ci = qp->qp_ci;
+	const uint16_t max = RTE_MIN((uint16_t)(qp->pi - reported_ci), nb_ops);
+	uint16_t op_num = 0;
+	int ret;
+
+	if (unlikely(max == 0))
+		return 0;
+	while (qp_ci - reported_ci < max) {
+		idx = next_idx;
+		next_idx = (qp->cq_ci + 1) & mask;
+		cqe = &qp->cq_obj.cqes[idx];
+		ret = check_cqe(cqe, cq_size, qp->cq_ci);
+		if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
+			if (unlikely(ret != MLX5_CQE_STATUS_HW_OWN))
+				mlx5_crypto_gcm_cqe_err_handle(qp,
+						qp->ops[reported_ci & op_mask]);
+			break;
+		}
+		qp_ci = rte_be_to_cpu_16(cqe->wqe_counter) + 1;
+		if (qp->has_umr &&
+		    (qp->last_gga_pi + 1) == qp_ci)
+			qp->has_umr = false;
+		qp->cq_ci++;
+	}
+	/* If wqe_counter changed, some CQEs were handled. */
+	if (likely(qp->qp_ci != qp_ci)) {
+		qp->qp_ci = qp_ci;
+		rte_io_wmb();
+		qp->cq_obj.db_rec[0] = rte_cpu_to_be_32(qp->cq_ci);
+	}
+	/* If reported_ci differs from qp_ci, ops were retrieved. */
+	if (qp_ci != reported_ci) {
+		op_num = RTE_MIN((uint16_t)(qp_ci - reported_ci), max);
+		reported_ci += op_num;
+		mlx5_crypto_gcm_cpy_tag(qp, qp->reported_ci, reported_ci, op_mask);
+		mlx5_crypto_gcm_fill_op(qp, ops, qp->reported_ci, reported_ci, op_mask);
+		qp->stats.dequeued_count += op_num;
+		qp->reported_ci = reported_ci;
+	}
+	return op_num;
+}
+
 int
 mlx5_crypto_gcm_init(struct mlx5_crypto_priv *priv)
 {
@@ -337,6 +923,8 @@  mlx5_crypto_gcm_init(struct mlx5_crypto_priv *priv)
 	mlx5_os_set_reg_mr_cb(&priv->reg_mr_cb, &priv->dereg_mr_cb);
 	dev_ops->queue_pair_setup = mlx5_crypto_gcm_qp_setup;
 	dev_ops->queue_pair_release = mlx5_crypto_gcm_qp_release;
+	crypto_dev->dequeue_burst = mlx5_crypto_gcm_dequeue_burst;
+	crypto_dev->enqueue_burst = mlx5_crypto_gcm_enqueue_burst;
 	priv->max_klm_num = RTE_ALIGN((priv->max_segs_num + 1) * 2 + 1, MLX5_UMR_KLM_NUM_ALIGN);
 	priv->caps = mlx5_crypto_gcm_caps;
 	return 0;