[v5,2/5] raw/afu_mf: add N3000 AFU driver

Message ID 1653629824-4535-3-git-send-email-wei.huang@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series introduce afu_mf raw device driver |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Wei Huang May 27, 2022, 5:37 a.m. UTC
  The N3000 AFU includes NLB0 and DMA modules: NLB0 is used to test the PCI bus
and DMA is used to test local memory.
This driver initializes the modules and reports the test results.

Signed-off-by: Wei Huang <wei.huang@intel.com>
---
 drivers/raw/afu_mf/afu_mf_rawdev.c |    4 +
 drivers/raw/afu_mf/afu_mf_rawdev.h |   18 +
 drivers/raw/afu_mf/meson.build     |    4 +-
 drivers/raw/afu_mf/n3000_afu.c     | 2005 ++++++++++++++++++++++++++++++++++++
 drivers/raw/afu_mf/n3000_afu.h     |  333 ++++++
 drivers/raw/afu_mf/rte_pmd_afu.h   |   97 ++
 6 files changed, 2460 insertions(+), 1 deletion(-)
 create mode 100644 drivers/raw/afu_mf/n3000_afu.c
 create mode 100644 drivers/raw/afu_mf/n3000_afu.h
 create mode 100644 drivers/raw/afu_mf/rte_pmd_afu.h
  

Comments

Zhang, Tianfei June 6, 2022, 1:38 a.m. UTC | #1
> -----Original Message-----
> From: Huang, Wei <wei.huang@intel.com>
> Sent: Friday, May 27, 2022 1:37 PM
> To: dev@dpdk.org; thomas@monjalon.net; nipun.gupta@nxp.com;
> hemant.agrawal@nxp.com
> Cc: stable@dpdk.org; Xu, Rosen <rosen.xu@intel.com>; Zhang, Tianfei
> <tianfei.zhang@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Huang, Wei
> <wei.huang@intel.com>
> Subject: [PATCH v5 2/5] raw/afu_mf: add N3000 AFU driver
> 
> N3000 AFU includes NLB0 and DMA modules, NLB0 is used to test PCI bus
> and DMA is used to test local memory.
> This driver initialize the modules and report test result.
> 
> Signed-off-by: Wei Huang <wei.huang@intel.com>
> ---
>  drivers/raw/afu_mf/afu_mf_rawdev.c |    4 +
>  drivers/raw/afu_mf/afu_mf_rawdev.h |   18 +
>  drivers/raw/afu_mf/meson.build     |    4 +-
>  drivers/raw/afu_mf/n3000_afu.c     | 2005
> ++++++++++++++++++++++++++++++++++++
>  drivers/raw/afu_mf/n3000_afu.h     |  333 ++++++
>  drivers/raw/afu_mf/rte_pmd_afu.h   |   97 ++
>  6 files changed, 2460 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/raw/afu_mf/n3000_afu.c
>  create mode 100644 drivers/raw/afu_mf/n3000_afu.h
>  create mode 100644 drivers/raw/afu_mf/rte_pmd_afu.h
> 
> diff --git a/drivers/raw/afu_mf/afu_mf_rawdev.c
> b/drivers/raw/afu_mf/afu_mf_rawdev.c
> index 5be372a..7c18f3b 100644
> --- a/drivers/raw/afu_mf/afu_mf_rawdev.c
> +++ b/drivers/raw/afu_mf/afu_mf_rawdev.c
> @@ -17,15 +17,19 @@
>  #include <rte_memzone.h>
>  #include <rte_rawdev_pmd.h>
> 
> +#include "rte_pmd_afu.h"
>  #include "afu_mf_rawdev.h"
> +#include "n3000_afu.h"
> 
>  #define AFU_MF_PMD_RAWDEV_NAME rawdev_afu_mf
> 
>  static const struct rte_afu_uuid afu_uuid_map[] = {
> +	{ N3000_AFU_UUID_L, N3000_AFU_UUID_H },
>  	{ 0, 0 /* sentinel */ }
>  };
> 
>  static struct afu_mf_drv *afu_table[] = {
> +	&n3000_afu_drv,
>  	NULL
>  };
> 
> diff --git a/drivers/raw/afu_mf/afu_mf_rawdev.h
> b/drivers/raw/afu_mf/afu_mf_rawdev.h
> index df6715c..5a66f6c 100644
> --- a/drivers/raw/afu_mf/afu_mf_rawdev.h
> +++ b/drivers/raw/afu_mf/afu_mf_rawdev.h
> @@ -30,6 +30,24 @@
>  #define AFU_MF_PMD_WARN(fmt, args...) \
>  	AFU_MF_PMD_LOG(WARNING, fmt, ## args)
> 
> +#define CLS_TO_SIZE(n)  ((n) << 6)  /* get size of n cache lines */
> +#define SIZE_TO_CLS(s)  ((s) >> 6)  /* convert size to number of cache lines */
> +#define MHZ(f)  ((f) * 1000000)
> +
> +#define dsm_poll_timeout(addr, val, cond, invl, timeout) \
> +({                                                       \
> +	uint64_t __wait = 0;                                 \
> +	uint64_t __invl = (invl);                            \
> +	uint64_t __timeout = (timeout);                      \
> +	for (; __wait <= __timeout; __wait += __invl) {      \
> +		(val) = *(addr);                                 \
> +		if (cond)                                        \
> +			break;                                       \
> +		rte_delay_ms(__invl);                            \
> +	}                                                    \
> +	(cond) ? 0 : 1;                                      \
> +})

Does "dsm" here mean DMA?

> +
>  struct afu_mf_rawdev;
> 
>  struct afu_mf_ops {
> diff --git a/drivers/raw/afu_mf/meson.build b/drivers/raw/afu_mf/meson.build
> index 80526a2..8a989e3 100644
> --- a/drivers/raw/afu_mf/meson.build
> +++ b/drivers/raw/afu_mf/meson.build
> @@ -2,4 +2,6 @@
>  # Copyright 2022 Intel Corporation
> 
>  deps += ['rawdev', 'bus_pci', 'bus_ifpga']
> -sources = files('afu_mf_rawdev.c')
> +sources = files('afu_mf_rawdev.c', 'n3000_afu.c')
> +
> +headers = files('rte_pmd_afu.h')
> diff --git a/drivers/raw/afu_mf/n3000_afu.c b/drivers/raw/afu_mf/n3000_afu.c
> new file mode 100644
> index 0000000..19d7c54
> --- /dev/null
> +++ b/drivers/raw/afu_mf/n3000_afu.c
> @@ -0,0 +1,2005 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2022 Intel Corporation
> + */
> +
> +#include <errno.h>
> +#include <stdio.h>
> +#include <stdint.h>
> +#include <stdlib.h>
> +#include <inttypes.h>
> +#include <unistd.h>
> +#include <fcntl.h>
> +#include <poll.h>
> +#include <sys/eventfd.h>
> +#include <sys/ioctl.h>
> +
> +#include <rte_eal.h>
> +#include <rte_malloc.h>
> +#include <rte_memcpy.h>
> +#include <rte_io.h>
> +#include <rte_vfio.h>
> +#include <rte_bus_pci.h>
> +#include <rte_bus_ifpga.h>
> +#include <rte_rawdev.h>
> +
> +#include "afu_mf_rawdev.h"
> +#include "n3000_afu.h"
> +
> +static int nlb_afu_config(struct afu_mf_rawdev *dev)
> +{
> +	struct n3000_afu_priv *priv = NULL;
> +	struct rte_pmd_afu_nlb_cfg *cfg = NULL;
> +	struct nlb_csr_cfg v;
> +
> +	if (!dev)
> +		return -EINVAL;
> +
> +	if (!dev->priv)
> +		return -ENOENT;
> +
> +	priv = (struct n3000_afu_priv *)dev->priv;
> +	cfg = &priv->nlb_cfg;
> +
> +	v.csr = 0;
> +
> +	if (cfg->cont)
> +		v.cont = 1;
> +
> +	if (cfg->cache_policy == NLB_WRPUSH_I)
> +		v.wrpush_i = 1;
> +	else
> +		v.wrthru_en = cfg->cache_policy;
> +
> +	if (cfg->cache_hint == NLB_RDLINE_MIXED)
> +		v.rdsel = 3;
> +	else
> +		v.rdsel = cfg->cache_hint;
> +
> +	v.mode = cfg->mode;
> +	v.chsel = cfg->read_vc;
> +	v.wr_chsel = cfg->write_vc;
> +	v.wrfence_chsel = cfg->wrfence_vc;
> +	v.wrthru_en = cfg->cache_policy;
> +	v.multicl_len = cfg->multi_cl - 1;
> +
> +	AFU_MF_PMD_DEBUG("cfg: 0x%08x", v.csr);
> +	rte_write32(v.csr, priv->nlb_ctx.addr + CSR_CFG);
> +
> +	return 0;
> +}
> +
> +static void nlb_afu_report(struct afu_mf_rawdev *dev, uint32_t cl)
> +{
> +	struct n3000_afu_priv *priv = NULL;
> +	struct rte_pmd_afu_nlb_cfg *cfg = NULL;
> +	struct nlb_dsm_status *stat = NULL;
> +	uint64_t ticks = 0;
> +	double num, rd_bw, wr_bw;
> +
> +	if (!dev || !dev->priv)
> +		return;
> +
> +	priv = (struct n3000_afu_priv *)dev->priv;
> +
> +	cfg = &priv->nlb_cfg;
> +	stat = priv->nlb_ctx.status_ptr;
> +
> +	if (cfg->cont)
> +		ticks = stat->num_clocks - stat->start_overhead;
> +	else
> +		ticks = stat->num_clocks -
> +			(stat->start_overhead + stat->end_overhead);
> +
> +	if (cfg->freq_mhz == 0)
> +		cfg->freq_mhz = 200;
> +
> +	num = (double)stat->num_reads;
> +	rd_bw = (num * CLS_TO_SIZE(1) * MHZ(cfg->freq_mhz)) / ticks;
> +	num = (double)stat->num_writes;
> +	wr_bw = (num * CLS_TO_SIZE(1) * MHZ(cfg->freq_mhz)) / ticks;
> +
> +	printf("Cachelines  Read_Count Write_Count Clocks@%uMHz   "
> +		"Rd_Bandwidth   Wr_Bandwidth\n", cfg->freq_mhz);
> +	printf("%10u  %10u %11u  %12"PRIu64"   %7.3f GB/s   %7.3f GB/s\n",
> +		cl, stat->num_reads, stat->num_writes, ticks,
> +		rd_bw / 1e9, wr_bw / 1e9);
> +}
> +
> +static int nlb_afu_test(struct afu_mf_rawdev *dev)
> +{
> +	struct n3000_afu_priv *priv = NULL;
> +	struct nlb_afu_ctx *ctx = NULL;
> +	struct rte_pmd_afu_nlb_cfg *cfg = NULL;
> +	struct nlb_csr_ctl ctl;
> +	uint32_t *ptr = NULL;
> +	uint32_t i, j, cl, val = 0;
> +	uint64_t sval = 0;
> +	int ret = 0;
> +
> +	if (!dev)
> +		return -EINVAL;
> +
> +	if (!dev->priv)
> +		return -ENOENT;
> +
> +	priv = (struct n3000_afu_priv *)dev->priv;
> +	ctx = &priv->nlb_ctx;
> +	cfg = &priv->nlb_cfg;
> +
> +	/* initialize registers */
> +	AFU_MF_PMD_DEBUG("dsm_addr: 0x%"PRIx64, ctx->dsm_iova);
> +	rte_write64(ctx->dsm_iova, ctx->addr + CSR_AFU_DSM_BASEL);
> +
> +	ctl.csr = 0;
> +	rte_write32(ctl.csr, ctx->addr + CSR_CTL);
> +	ctl.reset = 1;
> +	rte_write32(ctl.csr, ctx->addr + CSR_CTL);
> +
> +	AFU_MF_PMD_DEBUG("src_addr: 0x%"PRIx64, ctx->src_iova);
> +	rte_write64(SIZE_TO_CLS(ctx->src_iova), ctx->addr + CSR_SRC_ADDR);
> +	AFU_MF_PMD_DEBUG("dst_addr: 0x%"PRIx64, ctx->dest_iova);
> +	rte_write64(SIZE_TO_CLS(ctx->dest_iova), ctx->addr + CSR_DST_ADDR);
> +
> +	ret = nlb_afu_config(dev);
> +	if (ret)
> +		return ret;
> +
> +	/* initialize src data */
> +	ptr = (uint32_t *)ctx->src_ptr;
> +	j = CLS_TO_SIZE(cfg->end) >> 2;
> +	for (i = 0; i < j; i++)
> +		*ptr++ = i;
> +
> +	/* start test */
> +	for (cl = cfg->begin; cl <= cfg->end; cl += cfg->multi_cl) {
> +		memset(ctx->dest_ptr, 0, CLS_TO_SIZE(cl));
> +		memset(ctx->dsm_ptr, 0, DSM_SIZE);
> +
> +		ctl.csr = 0;
> +		rte_write32(ctl.csr, ctx->addr + CSR_CTL);
> +		ctl.reset = 1;
> +		rte_write32(ctl.csr, ctx->addr + CSR_CTL);
> +
> +		rte_write32(cl, ctx->addr + CSR_NUM_LINES);
> +
> +		rte_delay_us(10);
> +
> +		ctl.start = 1;
> +		rte_write32(ctl.csr, ctx->addr + CSR_CTL);
> +
> +		if (cfg->cont) {
> +			rte_delay_ms(cfg->timeout * 1000);
> +			ctl.force_completion = 1;
> +			rte_write32(ctl.csr, ctx->addr + CSR_CTL);
> +			ret = dsm_poll_timeout(&ctx->status_ptr-
> >test_complete,
> +				val, (val & 0x1) == 1, DSM_POLL_INTERVAL,
> +				DSM_TIMEOUT);
> +			if (ret) {
> +				printf("DSM poll timeout\n");
> +				goto end;
> +			}
> +		} else {
> +			ret = dsm_poll_timeout(&ctx->status_ptr-
> >test_complete,
> +				val, (val & 0x1) == 1, DSM_POLL_INTERVAL,
> +				DSM_TIMEOUT);
> +			if (ret) {
> +				printf("DSM poll timeout\n");
> +				goto end;
> +			}
> +			ctl.force_completion = 1;
> +			rte_write32(ctl.csr, ctx->addr + CSR_CTL);
> +		}
> +
> +		nlb_afu_report(dev, cl);
> +
> +		i = 0;
> +		while (i++ < 100) {
> +			sval = rte_read64(ctx->addr + CSR_STATUS1);
> +			if (sval == 0)
> +				break;
> +			rte_delay_us(1000);
> +		}
> +
> +		ptr = (uint32_t *)ctx->dest_ptr;
> +		j = CLS_TO_SIZE(cl) >> 2;
> +		for (i = 0; i < j; i++) {
> +			if (*ptr++ != i) {
> +				AFU_MF_PMD_ERR("Data mismatch @ %u", i);
> +				break;
> +			}
> +		}
> +	}
> +
> +end:
> +	return ret;
> +}
> +
> +static void dma_afu_buf_free(struct dma_afu_ctx *ctx)
> +{
> +	int i = 0;
> +
> +	if (!ctx)
> +		return;
> +
> +	for (i = 0; i < NUM_DMA_BUF; i++) {
> +		rte_free(ctx->dma_buf[i]);
> +		ctx->dma_buf[i] = NULL;
> +	}
> +
> +	rte_free(ctx->data_buf);
> +	ctx->data_buf = NULL;
> +
> +	rte_free(ctx->ref_buf);
> +	ctx->ref_buf = NULL;
> +}
> +
> +static int dma_afu_buf_alloc(struct dma_afu_ctx *ctx,
> +	struct rte_pmd_afu_dma_cfg *cfg)
> +{
> +	size_t page_sz = sysconf(_SC_PAGE_SIZE);
> +	int i, ret = 0;
> +
> +	if (!ctx || !cfg)
> +		return -EINVAL;
> +
> +	for (i = 0; i < NUM_DMA_BUF; i++) {
> +		ctx->dma_buf[i] = (uint64_t *)rte_zmalloc(NULL, cfg->size,
> +			TEST_MEM_ALIGN);
> +		if (!ctx->dma_buf[i]) {
> +			ret = -ENOMEM;
> +			goto free;
> +		}
> +		ctx->dma_iova[i] = rte_malloc_virt2iova(ctx->dma_buf[i]);
> +		if (ctx->dma_iova[i] == RTE_BAD_IOVA) {
> +			ret = -ENOMEM;
> +			goto free;
> +		}
> +	}
> +
> +	ctx->data_buf = rte_malloc(NULL, cfg->length, page_sz);
> +	if (!ctx->data_buf) {
> +		ret = -ENOMEM;
> +		goto free;
> +	}
> +
> +	ctx->ref_buf = rte_malloc(NULL, cfg->length, page_sz);
> +	if (!ctx->ref_buf) {
> +		ret = -ENOMEM;
> +		goto free;
> +	}

Suppose the ctx->ref_buf allocation fails: will dma_afu_buf_free() still work correctly?

> +
> +	return 0;
> +
> +free:
> +	dma_afu_buf_free(ctx);
> +	return ret;
> +}
> +
> +static void dma_afu_buf_init(struct dma_afu_ctx *ctx, size_t size)
> +{
> +	int *ptr = NULL;
> +	size_t i = 0;
> +	size_t dword_size = 0;
> +
> +	if (!ctx || !size)
> +		return;
> +
> +	ptr = (int *)ctx->ref_buf;
> +
> +	if (ctx->pattern) {
> +		memset(ptr, ctx->pattern, size);
> +	} else {
> +		srand(99);
> +		dword_size = size >> 2;
> +		for (i = 0; i < dword_size; i++)
> +			*ptr++ = rand();
> +	}
> +	rte_memcpy(ctx->data_buf, ctx->ref_buf, size);
> +}
> +
> +static int dma_afu_buf_verify(struct dma_afu_ctx *ctx, size_t size)
> +{
> +	uint8_t *src = NULL;
> +	uint8_t *dst = NULL;
> +	size_t i = 0;
> +	int n = 0;
> +
> +	if (!ctx || !size)
> +		return -EINVAL;
> +
> +	src = (uint8_t *)ctx->ref_buf;
> +	dst = (uint8_t *)ctx->data_buf;
> +
> +	if (memcmp(src, dst, size)) {
> +		printf("Transfer is corrupted\n");
> +		if (ctx->verbose) {
> +			for (i = 0; i < size; i++) {
> +				if (*src != *dst) {
> +					if (++n >= ERR_CHECK_LIMIT)
> +						break;
> +					printf("Mismatch at 0x%zx, "
> +						"Expected %02x  Actual
> %02x\n",
> +						i, *src, *dst);
> +				}
> +				src++;
> +				dst++;
> +			}
> +			if (n < ERR_CHECK_LIMIT) {
> +				printf("Found %d error bytes\n", n);
> +			} else {
> +				printf("......\n");
> +				printf("Found more than %d error bytes\n", n);
> +			}
> +		}
> +		return -1;
> +	}
> +
> +	printf("Transfer is verified\n");
> +	return 0;
> +}
> +
> +static void blk_write64(uint64_t *dev_addr, uint64_t *host_addr, uint64_t
> bytes)
> +{
> +	uint64_t qwords = bytes / sizeof(uint64_t);
> +
> +	if (!IS_ALIGNED_QWORD((uint64_t)dev_addr) ||
> +		!IS_ALIGNED_QWORD((uint64_t)bytes))
> +		return;
> +
> +	for (; qwords > 0; qwords--, host_addr++, dev_addr++)
> +		rte_write64(*host_addr, dev_addr);
> +}
> +
> +static void blk_read64(uint64_t *dev_addr, uint64_t *host_addr, uint64_t
> bytes)
> +{
> +	uint64_t qwords = bytes / sizeof(uint64_t);
> +
> +	if (!IS_ALIGNED_QWORD((uint64_t)dev_addr) ||
> +		!IS_ALIGNED_QWORD((uint64_t)bytes))
> +		return;
> +
> +	for (; qwords > 0; qwords--, host_addr++, dev_addr++)
> +		*host_addr = rte_read64(dev_addr);
> +}
> +
> +static void switch_ase_page(struct dma_afu_ctx *ctx, uint64_t addr)
> +{
> +	uint64_t requested_page = addr & ~DMA_ASE_WINDOW_MASK;
> +
> +	if (!ctx)
> +		return;
> +
> +	if (requested_page != ctx->cur_ase_page) {
> +		rte_write64(requested_page, ctx->ase_ctrl_addr);
> +		ctx->cur_ase_page = requested_page;
> +	}
> +}
> +
> +static int ase_write_unaligned(struct dma_afu_ctx *ctx, uint64_t dev_addr,
> +	uint64_t host_addr, uint32_t count)
> +{
> +	uint64_t dev_aligned_addr = 0;
> +	uint64_t shift = 0;
> +	uint64_t val = 0;
> +	uintptr_t addr = (uintptr_t)host_addr;  /* transfer to pointer size */
> +
> +	AFU_MF_PMD_DEBUG("0x%"PRIx64" --> 0x%"PRIx64" (0x%x)",
> host_addr,
> +		dev_addr, count);
> +
> +	if (!ctx || (count >= QWORD_BYTES))
> +		return -EINVAL;
> +
> +	if (!count)
> +		return 0;
> +
> +	switch_ase_page(ctx, dev_addr);
> +
> +	shift = dev_addr % QWORD_BYTES;
> +	dev_aligned_addr = (dev_addr - shift) & DMA_ASE_WINDOW_MASK;
> +	val = rte_read64(ctx->ase_data_addr + dev_aligned_addr);
> +	rte_memcpy(((char *)(&val)) + shift, (void *)addr, count);
> +
> +	/* write back to device */
> +	rte_write64(val, ctx->ase_data_addr + dev_aligned_addr);
> +
> +	return 0;
> +}
> +
> +static int ase_write(struct dma_afu_ctx *ctx, uint64_t *dst_ptr,
> +	uint64_t *src_ptr, uint64_t *count)
> +{
> +	uint64_t src = *src_ptr;
> +	uint64_t dst = *dst_ptr;
> +	uint64_t align_bytes = *count;
> +	uint64_t offset = 0;
> +	uint64_t left_in_page = DMA_ASE_WINDOW;
> +	uint64_t size_to_copy = 0;
> +
> +	AFU_MF_PMD_DEBUG("0x%"PRIx64" --> 0x%"PRIx64" (0x%"PRIx64")",
> src, dst,
> +		align_bytes);
> +
> +	if (!ctx || !IS_ALIGNED_DWORD(dst))
> +		return -EINVAL;
> +
> +	if (align_bytes < DWORD_BYTES)
> +		return 0;
> +
> +	if (!IS_ALIGNED_QWORD(dst)) {
> +		/* Write out a single DWORD to get QWORD aligned */
> +		switch_ase_page(ctx, dst);
> +		offset = dst & DMA_ASE_WINDOW_MASK;
> +
> +		rte_write32(*(uint32_t *)(uintptr_t)src,
> +			ctx->ase_data_addr + offset);
> +		src += DWORD_BYTES;
> +		dst += DWORD_BYTES;
> +		align_bytes -= DWORD_BYTES;
> +	}
> +
> +	if (!align_bytes)
> +		return 0;
> +
> +	/* Write out blocks of 64-bit values */
> +	while (align_bytes >= QWORD_BYTES) {
> +		left_in_page -= dst & DMA_ASE_WINDOW_MASK;
> +		size_to_copy =
> +			MIN(left_in_page, (align_bytes & ~(QWORD_BYTES -
> 1)));
> +		if (size_to_copy < QWORD_BYTES)
> +			break;
> +		switch_ase_page(ctx, dst);
> +		offset = dst & DMA_ASE_WINDOW_MASK;
> +		blk_write64((uint64_t *)(ctx->ase_data_addr + offset),
> +			(uint64_t *)(uintptr_t)src, size_to_copy);
> +		src += size_to_copy;
> +		dst += size_to_copy;
> +		align_bytes -= size_to_copy;
> +	}
> +
> +	if (align_bytes >= DWORD_BYTES) {
> +		/* Write out remaining DWORD */
> +		switch_ase_page(ctx, dst);
> +		offset = dst & DMA_ASE_WINDOW_MASK;
> +		rte_write32(*(uint32_t *)(uintptr_t)src,
> +			ctx->ase_data_addr + offset);
> +		src += DWORD_BYTES;
> +		dst += DWORD_BYTES;
> +		align_bytes -= DWORD_BYTES;
> +	}
> +
> +	*src_ptr = src;
> +	*dst_ptr = dst;
> +	*count = align_bytes;
> +
> +	return 0;
> +}
> +
> +static int ase_host_to_fpga(struct dma_afu_ctx *ctx, uint64_t *dst_ptr,
> +	uint64_t *src_ptr, uint64_t count)
> +{
> +	uint64_t dst = *dst_ptr;
> +	uint64_t src = *src_ptr;
> +	uint64_t count_left = count;
> +	uint64_t unaligned_size = 0;
> +	int ret = 0;
> +
> +	AFU_MF_PMD_DEBUG("0x%"PRIx64" --> 0x%"PRIx64" (0x%"PRIx64")",
> src, dst,
> +		count);
> +
> +	/* aligns address to 8 byte using dst masking method */
> +	if (!IS_ALIGNED_DWORD(dst) && !IS_ALIGNED_QWORD(dst)) {
> +		unaligned_size = QWORD_BYTES - (dst % QWORD_BYTES);
> +		if (unaligned_size > count_left)
> +			unaligned_size = count_left;
> +		ret = ase_write_unaligned(ctx, dst, src, unaligned_size);
> +		if (ret)
> +			return ret;
> +		count_left -= unaligned_size;
> +		src += unaligned_size;
> +		dst += unaligned_size;
> +	}
> +
> +	/* Handles 8/4 byte MMIO transfer */
> +	ret = ase_write(ctx, &dst, &src, &count_left);
> +	if (ret)
> +		return ret;
> +
> +	/* Left over unaligned bytes transferred using dst masking method */
> +	unaligned_size = QWORD_BYTES - (dst % QWORD_BYTES);
> +	if (unaligned_size > count_left)
> +		unaligned_size = count_left;
> +
> +	ret = ase_write_unaligned(ctx, dst, src, unaligned_size);
> +	if (ret)
> +		return ret;
> +
> +	count_left -= unaligned_size;
> +	*dst_ptr = dst + unaligned_size;
> +	*src_ptr = src + unaligned_size;
> +
> +	return 0;
> +}
> +
> +static int ase_read_unaligned(struct dma_afu_ctx *ctx, uint64_t dev_addr,
> +	uint64_t host_addr, uint32_t count)
> +{
> +	uint64_t dev_aligned_addr = 0;
> +	uint64_t shift = 0;
> +	uint64_t val = 0;
> +	uintptr_t addr = (uintptr_t)host_addr;  /* transfer to pointer size */
> +
> +	AFU_MF_PMD_DEBUG("0x%"PRIx64" <-- 0x%"PRIx64" (0x%x)",
> host_addr,
> +		dev_addr, count);
> +
> +	if (!ctx || (count >= QWORD_BYTES))
> +		return -EINVAL;
> +
> +	if (!count)
> +		return 0;
> +
> +	switch_ase_page(ctx, dev_addr);
> +
> +	shift = dev_addr % QWORD_BYTES;
> +	dev_aligned_addr = (dev_addr - shift) & DMA_ASE_WINDOW_MASK;
> +	val = rte_read64(ctx->ase_data_addr + dev_aligned_addr);
> +	rte_memcpy((void *)addr, ((char *)(&val)) + shift, count);
> +
> +	return 0;
> +}
> +
> +static int ase_read(struct dma_afu_ctx *ctx, uint64_t *src_ptr,
> +	uint64_t *dst_ptr, uint64_t *count)
> +{
> +	uint64_t src = *src_ptr;
> +	uint64_t dst = *dst_ptr;
> +	uint64_t align_bytes = *count;
> +	uint64_t offset = 0;
> +	uint64_t left_in_page = DMA_ASE_WINDOW;
> +	uint64_t size_to_copy = 0;
> +
> +	AFU_MF_PMD_DEBUG("0x%"PRIx64" <-- 0x%"PRIx64" (0x%"PRIx64")",
> dst, src,
> +		align_bytes);
> +
> +	if (!ctx || !IS_ALIGNED_DWORD(src))
> +		return -EINVAL;
> +
> +	if (align_bytes < DWORD_BYTES)
> +		return 0;
> +
> +	if (!IS_ALIGNED_QWORD(src)) {
> +		/* Read a single DWORD to get QWORD aligned */
> +		switch_ase_page(ctx, src);
> +		offset = src & DMA_ASE_WINDOW_MASK;
> +		*(uint32_t *)(uintptr_t)dst =
> +			rte_read32(ctx->ase_data_addr + offset);
> +		src += DWORD_BYTES;
> +		dst += DWORD_BYTES;
> +		align_bytes -= DWORD_BYTES;
> +	}
> +
> +	if (!align_bytes)
> +		return 0;
> +
> +	/* Read blocks of 64-bit values */
> +	while (align_bytes >= QWORD_BYTES) {
> +		left_in_page -= src & DMA_ASE_WINDOW_MASK;
> +		size_to_copy =
> +			MIN(left_in_page, (align_bytes & ~(QWORD_BYTES -
> 1)));
> +		if (size_to_copy < QWORD_BYTES)
> +			break;
> +		switch_ase_page(ctx, src);
> +		offset = src & DMA_ASE_WINDOW_MASK;
> +		blk_read64((uint64_t *)(ctx->ase_data_addr + offset),
> +			(uint64_t *)(uintptr_t)dst, size_to_copy);
> +		src += size_to_copy;
> +		dst += size_to_copy;
> +		align_bytes -= size_to_copy;
> +	}
> +
> +	if (align_bytes >= DWORD_BYTES) {
> +		/* Read remaining DWORD */
> +		switch_ase_page(ctx, src);
> +		offset = src & DMA_ASE_WINDOW_MASK;
> +		*(uint32_t *)(uintptr_t)dst =
> +			rte_read32(ctx->ase_data_addr + offset);
> +		src += DWORD_BYTES;
> +		dst += DWORD_BYTES;
> +		align_bytes -= DWORD_BYTES;
> +	}
> +
> +	*src_ptr = src;
> +	*dst_ptr = dst;
> +	*count = align_bytes;
> +
> +	return 0;
> +}
> +
> +static int ase_fpga_to_host(struct dma_afu_ctx *ctx, uint64_t *src_ptr,
> +	uint64_t *dst_ptr, uint64_t count)
> +{
> +	uint64_t src = *src_ptr;
> +	uint64_t dst = *dst_ptr;
> +	uint64_t count_left = count;
> +	uint64_t unaligned_size = 0;
> +	int ret = 0;
> +
> +	AFU_MF_PMD_DEBUG("0x%"PRIx64" --> 0x%"PRIx64" (0x%"PRIx64")",
> src, dst,
> +		count);
> +
> +	/* Aligns address to 8 byte using src masking method */
> +	if (!IS_ALIGNED_DWORD(src) && !IS_ALIGNED_QWORD(src)) {
> +		unaligned_size = QWORD_BYTES - (src % QWORD_BYTES);
> +		if (unaligned_size > count_left)
> +			unaligned_size = count_left;
> +		ret = ase_read_unaligned(ctx, src, dst, unaligned_size);
> +		if (ret)
> +			return ret;
> +		count_left -= unaligned_size;
> +		dst += unaligned_size;
> +		src += unaligned_size;
> +	}
> +
> +	/* Handles 8/4 byte MMIO transfer */
> +	ret = ase_read(ctx, &src, &dst, &count_left);
> +	if (ret)
> +		return ret;
> +
> +	/* Left over unaligned bytes transferred using src masking method */
> +	unaligned_size = QWORD_BYTES - (src % QWORD_BYTES);
> +	if (unaligned_size > count_left)
> +		unaligned_size = count_left;
> +
> +	ret = ase_read_unaligned(ctx, src, dst, unaligned_size);
> +	if (ret)
> +		return ret;
> +
> +	count_left -= unaligned_size;
> +	*dst_ptr = dst + unaligned_size;
> +	*src_ptr = src + unaligned_size;
> +
> +	return 0;
> +}
> +
> +static void clear_interrupt(struct dma_afu_ctx *ctx)
> +{
> +	/* clear interrupt by writing 1 to IRQ bit in status register */
> +	msgdma_status status;
> +
> +	if (!ctx)
> +		return;
> +
> +	status.csr = 0;
> +	status.irq = 1;
> +	rte_write32(status.csr, CSR_STATUS(ctx->csr_addr));
> +}
> +
> +static int poll_interrupt(struct dma_afu_ctx *ctx)
> +{
> +	struct pollfd pfd = {0};
> +	uint64_t count = 0;
> +	ssize_t bytes_read = 0;
> +	int poll_ret = 0;
> +	int ret = 0;
> +
> +	if (!ctx || (ctx->event_fd < 0))
> +		return -EINVAL;
> +
> +	pfd.fd = ctx->event_fd;
> +	pfd.events = POLLIN;
> +	poll_ret = poll(&pfd, 1, DMA_TIMEOUT_MSEC);
> +	if (poll_ret < 0) {
> +		AFU_MF_PMD_ERR("Error %s", strerror(errno));
> +		ret = -EFAULT;
> +		goto out;
> +	} else if (poll_ret == 0) {
> +		AFU_MF_PMD_ERR("Timeout");
> +		ret = -ETIMEDOUT;
> +	} else {
> +		bytes_read = read(pfd.fd, &count, sizeof(count));
> +		if (bytes_read > 0) {
> +			if (ctx->verbose)
> +				AFU_MF_PMD_DEBUG("Successful, ret %d, cnt
> %"PRIu64,
> +					poll_ret, count);
> +			ret = 0;
> +		} else {
> +			AFU_MF_PMD_ERR("Failed %s", bytes_read > 0 ?
> +				strerror(errno) : "zero bytes read");
> +			ret = -EIO;
> +		}
> +	}
> +out:
> +	clear_interrupt(ctx);
> +	return ret;
> +}
> +
> +static void send_descriptor(struct dma_afu_ctx *ctx, msgdma_ext_desc *desc)
> +{
> +	msgdma_status status;
> +	uint64_t fpga_queue_full = 0;
> +
> +	if (!ctx)
> +		return;
> +
> +	if (ctx->verbose) {
> +		AFU_MF_PMD_DEBUG("descriptor.rd_address = 0x%x%08x",
> +			desc->rd_address_ext, desc->rd_address);
> +		AFU_MF_PMD_DEBUG("descriptor.wr_address = 0x%x%08x",
> +			desc->wr_address_ext, desc->wr_address);
> +		AFU_MF_PMD_DEBUG("descriptor.len = %u", desc->len);
> +		AFU_MF_PMD_DEBUG("descriptor.wr_burst_count = %u",
> +			desc->wr_burst_count);
> +		AFU_MF_PMD_DEBUG("descriptor.rd_burst_count = %u",
> +			desc->rd_burst_count);
> +		AFU_MF_PMD_DEBUG("descriptor.wr_stride %u", desc-
> >wr_stride);
> +		AFU_MF_PMD_DEBUG("descriptor.rd_stride %u", desc-
> >rd_stride);
> +	}
> +
> +	do {
> +		status.csr = rte_read32(CSR_STATUS(ctx->csr_addr));
> +		if (fpga_queue_full++ > 100000000) {
> +			AFU_MF_PMD_DEBUG("DMA queue full retry");
> +			fpga_queue_full = 0;
> +		}
> +	} while (status.desc_buf_full);
> +
> +	blk_write64((uint64_t *)ctx->desc_addr, (uint64_t *)desc,
> +		sizeof(*desc));
> +}
> +
> +static int do_dma(struct dma_afu_ctx *ctx, uint64_t dst, uint64_t src,
> +	int count, int is_last_desc, fpga_dma_type type, int intr_en)
> +{
> +	msgdma_ext_desc *desc = NULL;
> +	int alignment_offset = 0;
> +	int segment_size = 0;
> +
> +	if (!ctx)
> +		return -EINVAL;
> +
> +	/* src, dst and count must be 64-byte aligned */
> +	if (!IS_DMA_ALIGNED(src) || !IS_DMA_ALIGNED(dst) ||
> +		!IS_DMA_ALIGNED(count))
> +		return -EINVAL;
> +	memset(ctx->desc_buf, 0, sizeof(msgdma_ext_desc));
> +
> +	/* these fields are fixed for all DMA transfers */
> +	desc = ctx->desc_buf;
> +	desc->seq_num = 0;
> +	desc->wr_stride = 1;
> +	desc->rd_stride = 1;
> +	desc->control.go = 1;
> +	if (intr_en)
> +		desc->control.transfer_irq_en = 1;
> +	else
> +		desc->control.transfer_irq_en = 0;
> +
> +	if (!is_last_desc)
> +		desc->control.early_done_en = 1;
> +	else
> +		desc->control.early_done_en = 0;
> +
> +	if (type == FPGA_TO_FPGA) {
> +		desc->rd_address = src & DMA_MASK_32_BIT;
> +		desc->wr_address = dst & DMA_MASK_32_BIT;
> +		desc->len = count;
> +		desc->wr_burst_count = 4;
> +		desc->rd_burst_count = 4;
> +		desc->rd_address_ext = (src >> 32) & DMA_MASK_32_BIT;
> +		desc->wr_address_ext = (dst >> 32) & DMA_MASK_32_BIT;
> +		send_descriptor(ctx, desc);
> +	} else {
> +		/* check CCIP (host) address is aligned to 4CL (256B) */
> +		alignment_offset = (type == HOST_TO_FPGA)
> +			? (src % CCIP_ALIGN_BYTES) : (dst %
> CCIP_ALIGN_BYTES);
> +		/* performing a short transfer to get aligned */
> +		if (alignment_offset != 0) {
> +			desc->rd_address = src & DMA_MASK_32_BIT;
> +			desc->wr_address = dst & DMA_MASK_32_BIT;
> +			desc->wr_burst_count = 1;
> +			desc->rd_burst_count = 1;
> +			desc->rd_address_ext = (src >> 32) &
> DMA_MASK_32_BIT;
> +			desc->wr_address_ext = (dst >> 32) &
> DMA_MASK_32_BIT;
> +			/* count isn't large enough to hit next 4CL boundary */
> +			if ((CCIP_ALIGN_BYTES - alignment_offset) >= count) {
> +				segment_size = count;
> +				count = 0;
> +			} else {
> +				segment_size = CCIP_ALIGN_BYTES
> +					- alignment_offset;
> +				src += segment_size;
> +				dst += segment_size;
> +				count -= segment_size;
> +				desc->control.transfer_irq_en = 0;
> +			}
> +			/* post short transfer to align to a 4CL (256 byte) */
> +			desc->len = segment_size;
> +			send_descriptor(ctx, desc);
> +		}
> +		/* at this point we are 4CL (256 byte) aligned */
> +		if (count >= CCIP_ALIGN_BYTES) {
> +			desc->rd_address = src & DMA_MASK_32_BIT;
> +			desc->wr_address = dst & DMA_MASK_32_BIT;
> +			desc->wr_burst_count = 4;
> +			desc->rd_burst_count = 4;
> +			desc->rd_address_ext = (src >> 32) &
> DMA_MASK_32_BIT;
> +			desc->wr_address_ext = (dst >> 32) &
> DMA_MASK_32_BIT;
> +			/* buffer ends on 4CL boundary */
> +			if ((count % CCIP_ALIGN_BYTES) == 0) {
> +				segment_size = count;
> +				count = 0;
> +			} else {
> +				segment_size = count
> +					- (count % CCIP_ALIGN_BYTES);
> +				src += segment_size;
> +				dst += segment_size;
> +				count -= segment_size;
> +				desc->control.transfer_irq_en = 0;
> +			}
> +			desc->len = segment_size;
> +			send_descriptor(ctx, desc);
> +		}
> +		/* post short transfer to handle the remainder */
> +		if (count > 0) {
> +			desc->rd_address = src & DMA_MASK_32_BIT;
> +			desc->wr_address = dst & DMA_MASK_32_BIT;
> +			desc->len = count;
> +			desc->wr_burst_count = 1;
> +			desc->rd_burst_count = 1;
> +			desc->rd_address_ext = (src >> 32) &
> DMA_MASK_32_BIT;
> +			desc->wr_address_ext = (dst >> 32) &
> DMA_MASK_32_BIT;
> +			if (intr_en)
> +				desc->control.transfer_irq_en = 1;
> +			send_descriptor(ctx, desc);
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static int issue_magic(struct dma_afu_ctx *ctx)
> +{
> +	*(ctx->magic_buf) = 0ULL;
> +	return do_dma(ctx, DMA_WF_HOST_ADDR(ctx->magic_iova),
> +		DMA_WF_MAGIC_ROM, 64, 1, FPGA_TO_HOST, 1);
> +}
> +
> +static void wait_magic(struct dma_afu_ctx *ctx)
> +{
> +	int magic_timeout = 0;
> +
> +	if (!ctx)
> +		return;
> +
> +	poll_interrupt(ctx);
> +	while (*(ctx->magic_buf) != DMA_WF_MAGIC) {
> +		if (magic_timeout++ > 1000) {
> +			AFU_MF_PMD_ERR("DMA magic operation timeout");
> +			magic_timeout = 0;
> +			break;
> +		}
> +	}
> +	*(ctx->magic_buf) = 0ULL;
> +}
> +
> +static int dma_tx_buf(struct dma_afu_ctx *ctx, uint64_t dst, uint64_t src,
> +	uint64_t chunk, int is_last_chunk, int *intr_issued)
> +{
> +	int intr_en = 0;
> +	int ret = 0;
> +
> +	if (!ctx || !intr_issued)
> +		return -EINVAL;
> +
> +	src += chunk * ctx->dma_buf_size;
> +	dst += chunk * ctx->dma_buf_size;
> +
> +	if (((chunk % HALF_DMA_BUF) == (HALF_DMA_BUF - 1)) ||
> is_last_chunk) {
> +		if (*intr_issued) {
> +			ret = poll_interrupt(ctx);
> +			if (ret)
> +				return ret;
> +		}
> +		intr_en = 1;
> +	}
> +
> +	chunk %= NUM_DMA_BUF;
> +	rte_memcpy(ctx->dma_buf[chunk], (void *)(uintptr_t)src,
> +		ctx->dma_buf_size);
> +	ret = do_dma(ctx, dst, DMA_HOST_ADDR(ctx->dma_iova[chunk]),
> +			ctx->dma_buf_size, 0, HOST_TO_FPGA, intr_en);
> +	if (intr_en)
> +		*intr_issued = 1;
> +
> +	return ret;
> +}
> +
> +static int dma_host_to_fpga(struct dma_afu_ctx *ctx, uint64_t dst, uint64_t
> src,
> +	size_t count)
> +{
> +	uint64_t i = 0;
> +	uint64_t count_left = count;
> +	uint64_t aligned_addr = 0;
> +	uint64_t align_bytes = 0;
> +	uint64_t dma_chunks = 0;
> +	uint64_t dma_tx_bytes = 0;
> +	uint64_t offset = 0;
> +	int issued_intr = 0;
> +	int ret = 0;
> +
> +	AFU_MF_PMD_DEBUG("0x%"PRIx64" ---> 0x%"PRIx64" (%zu)", src, dst,
> +		count);
> +
> +	if (!ctx)
> +		return -EINVAL;
> +
> +	if (!IS_DMA_ALIGNED(dst)) {
> +		if (count_left < DMA_ALIGN_BYTES)
> +			return ase_host_to_fpga(ctx, &dst, &src, count_left);
> +
> +		aligned_addr = ((dst / DMA_ALIGN_BYTES) + 1)
> +			* DMA_ALIGN_BYTES;
> +		align_bytes = aligned_addr - dst;
> +		ret = ase_host_to_fpga(ctx, &dst, &src, align_bytes);
> +		if (ret)
> +			return ret;
> +		count_left = count_left - align_bytes;
> +	}
> +
> +	if (count_left) {
> +		dma_chunks = count_left / ctx->dma_buf_size;
> +		offset = dma_chunks * ctx->dma_buf_size;
> +		count_left -= offset;
> +		AFU_MF_PMD_DEBUG("0x%"PRIx64" ---> 0x%"PRIx64
> +			" (%"PRIu64"...0x%"PRIx64")",
> +			src, dst, dma_chunks, count_left);
> +		for (i = 0; i < dma_chunks; i++) {
> +			ret = dma_tx_buf(ctx, dst, src, i,
> +				i == (dma_chunks - 1), &issued_intr);
> +			if (ret)
> +				return ret;
> +		}
> +
> +		if (issued_intr) {
> +			ret = poll_interrupt(ctx);
> +			if (ret)
> +				return ret;
> +		}
> +
> +		if (count_left) {
> +			i = count_left / DMA_ALIGN_BYTES;
> +			if (i > 0) {
> +				dma_tx_bytes = i * DMA_ALIGN_BYTES;
> +				AFU_MF_PMD_DEBUG("left over 0x%"PRIx64"
> to DMA",
> +					dma_tx_bytes);
> +				rte_memcpy(ctx->dma_buf[0],
> +					(void *)(uintptr_t)(src + offset),
> +					dma_tx_bytes);
> +				ret = do_dma(ctx, dst + offset,
> +					DMA_HOST_ADDR(ctx->dma_iova[0]),
> +					dma_tx_bytes, 1, HOST_TO_FPGA, 1);
> +				if (ret)
> +					return ret;
> +				ret = poll_interrupt(ctx);
> +				if (ret)
> +					return ret;
> +			}
> +
> +			count_left -= dma_tx_bytes;
> +			if (count_left) {
> +				AFU_MF_PMD_DEBUG("left over 0x%"PRIx64"
> to ASE",
> +					count_left);
> +				dst += offset + dma_tx_bytes;
> +				src += offset + dma_tx_bytes;
> +				ret = ase_host_to_fpga(ctx, &dst, &src,
> +					count_left);
> +			}
> +		}
> +	}
> +
> +	return ret;
> +}
> +
> +/*
> + * Queue the FPGA-to-host DMA for one dma_buf_size chunk and, when enough
> + * chunks are outstanding, copy completed bounce buffers out to dst.
> + *
> + * ctx:           DMA context (dma_iova[], dma_buf[], dma_buf_size are used)
> + * dst:           host destination base address; chunk k lands at
> + *                dst + k * dma_buf_size
> + * src:           FPGA source base address
> + * chunk:         zero-based index of the chunk to fetch; it is received in
> + *                bounce buffer (chunk % NUM_DMA_BUF)
> + * is_last_chunk: non-zero for the final full chunk of the transfer
> + * rx_count:      in/out, number of chunks already copied to dst
> + * wf_issued:     in/out, 1 while an issue_magic() has not been waited for
> + *
> + * Returns 0 on success, -EINVAL when a pointer argument is NULL, or the
> + * error reported by do_dma()/issue_magic().
> + */
> +static int dma_rx_buf(struct dma_afu_ctx *ctx, uint64_t dst, uint64_t src,
> +	uint64_t chunk, int is_last_chunk, uint64_t *rx_count, int *wf_issued)
> +{
> +	uint64_t i = chunk % NUM_DMA_BUF;
> +	uint64_t n = 0;
> +	uint64_t num_pending = 0;
> +	int ret = 0;
> +
> +	/* rx_count is read and written below, so validate it before use */
> +	if (!ctx || !rx_count || !wf_issued)
> +		return -EINVAL;
> +
> +	n = *rx_count;
> +
> +	ret = do_dma(ctx, DMA_HOST_ADDR(ctx->dma_iova[i]),
> +		src + chunk * ctx->dma_buf_size,
> +		ctx->dma_buf_size, 1, FPGA_TO_HOST, 0);
> +	if (ret)
> +		return ret;
> +
> +	/* chunks queued so far that have not been copied to dst yet */
> +	num_pending = chunk - n + 1;
> +	if (num_pending == HALF_DMA_BUF) {
> +		ret = issue_magic(ctx);
> +		if (ret) {
> +			AFU_MF_PMD_DEBUG("Magic issue failed");
> +			return ret;
> +		}
> +		*wf_issued = 1;
> +	}
> +
> +	/* every bounce buffer is in use, or nothing more will be queued */
> +	if ((num_pending > (NUM_DMA_BUF - 1)) || is_last_chunk) {
> +		if (*wf_issued) {
> +			wait_magic(ctx);
> +			/* drain the half of the ring covered by that magic */
> +			for (i = 0; i < HALF_DMA_BUF; i++) {
> +				rte_memcpy((void *)(uintptr_t)(dst +
> +						n * ctx->dma_buf_size),
> +					ctx->dma_buf[n % NUM_DMA_BUF],
> +					ctx->dma_buf_size);
> +				n++;
> +			}
> +			*wf_issued = 0;
> +			*rx_count = n;
> +		}
> +		ret = issue_magic(ctx);
> +		if (ret) {
> +			AFU_MF_PMD_DEBUG("Magic issue failed");
> +			return ret;
> +		}
> +		*wf_issued = 1;
> +	}
> +
> +	return ret;
> +}
> +
> +/*
> + * Copy count bytes from FPGA address src to host address dst.
> + *
> + * The transfer is split into up to four phases:
> + *  1. an unaligned head (src not a multiple of DMA_ALIGN_BYTES) moved with
> + *     ase_fpga_to_host();
> + *  2. whole chunks of ctx->dma_buf_size moved with dma_rx_buf();
> + *  3. a remainder that is a multiple of DMA_ALIGN_BYTES moved with one
> + *     do_dma() into ctx->dma_buf[0];
> + *  4. the final sub-DMA_ALIGN_BYTES tail moved with ase_fpga_to_host().
> + *
> + * NOTE(review): ase_fpga_to_host() receives &src and &dst; the code after
> + * phase 1 relies on both having been advanced by it - confirm in that helper.
> + *
> + * Returns 0 on success, -EINVAL when ctx is NULL, or the first error
> + * reported by a helper.
> + */
> +static int dma_fpga_to_host(struct dma_afu_ctx *ctx, uint64_t dst, uint64_t
> src,
> +	size_t count)
> +{
> +	uint64_t i = 0;
> +	uint64_t count_left = count;
> +	uint64_t aligned_addr = 0;
> +	uint64_t align_bytes = 0;
> +	uint64_t dma_chunks = 0;
> +	uint64_t pending_buf = 0;
> +	uint64_t dma_rx_bytes = 0;
> +	uint64_t offset = 0;
> +	int wf_issued = 0;
> +	int ret = 0;
> +
> +	AFU_MF_PMD_DEBUG("0x%"PRIx64" ---> 0x%"PRIx64" (%zu)", src, dst,
> +		count);
> +
> +	if (!ctx)
> +		return -EINVAL;
> +
> +	/* phase 1: bring src up to the next DMA_ALIGN_BYTES boundary */
> +	if (!IS_DMA_ALIGNED(src)) {
> +		if (count_left < DMA_ALIGN_BYTES)
> +			return ase_fpga_to_host(ctx, &src, &dst, count_left);
> +
> +		aligned_addr = ((src / DMA_ALIGN_BYTES) + 1)
> +			 * DMA_ALIGN_BYTES;
> +		align_bytes = aligned_addr - src;
> +		ret = ase_fpga_to_host(ctx, &src, &dst, align_bytes);
> +		if (ret)
> +			return ret;
> +		count_left = count_left - align_bytes;
> +	}
> +
> +	if (count_left) {
> +		/* phase 2: whole dma_buf_size chunks */
> +		dma_chunks = count_left / ctx->dma_buf_size;
> +		offset = dma_chunks * ctx->dma_buf_size;
> +		count_left -= offset;
> +		AFU_MF_PMD_DEBUG("0x%"PRIx64" ---> 0x%"PRIx64
> +			" (%"PRIu64"...0x%"PRIx64")",
> +			src, dst, dma_chunks, count_left);
> +		for (i = 0; i < dma_chunks; i++) {
> +			ret = dma_rx_buf(ctx, dst, src, i,
> +				i == (dma_chunks - 1),
> +				&pending_buf, &wf_issued);
> +			if (ret)
> +				return ret;
> +		}
> +
> +		/* wait for the magic left outstanding by the last dma_rx_buf() */
> +		if (wf_issued)
> +			wait_magic(ctx);
> +
> +		/* clear out final dma memcpy operations */
> +		while (pending_buf < dma_chunks) {
> +			/* constant size transfer; no length check required */
> +			rte_memcpy((void *)(uintptr_t)(dst +
> +					pending_buf * ctx->dma_buf_size),
> +				ctx->dma_buf[pending_buf %
> NUM_DMA_BUF],
> +				ctx->dma_buf_size);
> +			pending_buf++;
> +		}
> +
> +		if (count_left > 0) {
> +			/* phase 3: largest DMA_ALIGN_BYTES multiple that is left */
> +			i = count_left / DMA_ALIGN_BYTES;
> +			if (i > 0) {
> +				dma_rx_bytes = i * DMA_ALIGN_BYTES;
> +				AFU_MF_PMD_DEBUG("left over 0x%"PRIx64"
> to DMA",
> +					dma_rx_bytes);
> +				ret = do_dma(ctx,
> +					DMA_HOST_ADDR(ctx->dma_iova[0]),
> +					src + offset,
> +					dma_rx_bytes, 1, FPGA_TO_HOST, 0);
> +				if (ret)
> +					return ret;
> +				ret = issue_magic(ctx);
> +				if (ret)
> +					return ret;
> +				wait_magic(ctx);
> +				rte_memcpy((void *)(uintptr_t)(dst + offset),
> +					ctx->dma_buf[0], dma_rx_bytes);
> +			}
> +
> +			/* phase 4: what is left is shorter than DMA_ALIGN_BYTES */
> +			count_left -= dma_rx_bytes;
> +			if (count_left) {
> +				AFU_MF_PMD_DEBUG("left over 0x%"PRIx64"
> to ASE",
> +					count_left);
> +				dst += offset + dma_rx_bytes;
> +				src += offset + dma_rx_bytes;
> +				ret = ase_fpga_to_host(ctx, &src, &dst,
> +							count_left);
> +			}
> +		}
> +	}
> +
> +	return ret;
> +}
> +
> +/*
> + * Copy count bytes from FPGA address src to FPGA address dst.
> + *
> + * When dst, src and count are all multiples of DMA_ALIGN_BYTES the data is
> + * moved directly with FPGA_TO_FPGA descriptors: whole dma_buf_size chunks
> + * first (a magic is issued and waited for after every NUM_DMA_BUF chunks and
> + * after the last one), then one descriptor for the remainder.
> + *
> + * Otherwise the data is bounced through a temporary host buffer of
> + * dma_buf_size bytes using dma_fpga_to_host() followed by
> + * dma_host_to_fpga(). In that path a source range that overlaps a higher
> + * destination is rejected.
> + *
> + * Returns 0 on success, -EINVAL for a NULL ctx or a rejected overlap,
> + * -ENOMEM when the bounce buffer cannot be allocated, or the first error
> + * reported by a helper.
> + */
> +static int dma_fpga_to_fpga(struct dma_afu_ctx *ctx, uint64_t dst,
> +	uint64_t src, size_t count)
> +{
> +	uint64_t i = 0;
> +	uint64_t count_left = count;
> +	uint64_t dma_chunks = 0;
> +	uint64_t offset = 0;
> +	uint32_t tx_chunks = 0;
> +	uint64_t *tmp_buf = NULL;
> +	int ret = 0;
> +
> +	AFU_MF_PMD_DEBUG("0x%"PRIx64" ---> 0x%"PRIx64" (%zu)", src, dst,
> +		count);
> +
> +	if (!ctx)
> +		return -EINVAL;
> +
> +	if (IS_DMA_ALIGNED(dst) && IS_DMA_ALIGNED(src)
> +	    && IS_DMA_ALIGNED(count_left)) {
> +		dma_chunks = count_left / ctx->dma_buf_size;
> +		offset = dma_chunks * ctx->dma_buf_size;
> +		count_left -= offset;
> +		AFU_MF_PMD_DEBUG("0x%"PRIx64" ---> 0x%"PRIx64
> +			" (%"PRIu64"...0x%"PRIx64")",
> +			src, dst, dma_chunks, count_left);
> +		for (i = 0; i < dma_chunks; i++) {
> +			ret = do_dma(ctx, dst + i * ctx->dma_buf_size,
> +				src + i * ctx->dma_buf_size,
> +				ctx->dma_buf_size, 0, FPGA_TO_FPGA, 0);
> +			if (ret)
> +				return ret;
> +			/* synchronize every NUM_DMA_BUF chunks and at the end */
> +			if ((((i + 1) % NUM_DMA_BUF) == 0) ||
> +				(i == (dma_chunks - 1))) {
> +				ret = issue_magic(ctx);
> +				if (ret)
> +					return ret;
> +				wait_magic(ctx);
> +			}
> +		}
> +
> +		if (count_left > 0) {
> +			AFU_MF_PMD_DEBUG("left over 0x%"PRIx64" to DMA",
> +				count_left);
> +			ret = do_dma(ctx, dst + offset, src + offset,
> +				count_left, 1, FPGA_TO_FPGA, 0);
> +			if (ret)
> +				return ret;
> +			ret = issue_magic(ctx);
> +			if (ret)
> +				return ret;
> +			wait_magic(ctx);
> +		}
> +	} else {
> +		if ((src < dst) && (src + count_left > dst)) {
> +			AFU_MF_PMD_ERR("Overlapping: 0x%"PRIx64
> +				" -> 0x%"PRIx64" (0x%"PRIx64")",
> +				src, dst, count_left);
> +			return -EINVAL;
> +		}
> +		tx_chunks = count_left / ctx->dma_buf_size;
> +		offset = tx_chunks * ctx->dma_buf_size;
> +		count_left -= offset;
> +		AFU_MF_PMD_DEBUG("0x%"PRIx64" --> 0x%"PRIx64
> +			" (%u...0x%"PRIx64")",
> +			src, dst, tx_chunks, count_left);
> +		tmp_buf = (uint64_t *)rte_malloc(NULL, ctx->dma_buf_size,
> +			DMA_ALIGN_BYTES);
> +		/* the buffer address is handed to the copy helpers below */
> +		if (!tmp_buf) {
> +			AFU_MF_PMD_ERR("Failed to allocate bounce buffer");
> +			return -ENOMEM;
> +		}
> +		for (i = 0; i < tx_chunks; i++) {
> +			ret = dma_fpga_to_host(ctx, (uint64_t)tmp_buf,
> +				src + i * ctx->dma_buf_size,
> +				ctx->dma_buf_size);
> +			if (ret)
> +				goto free_buf;
> +			ret = dma_host_to_fpga(ctx,
> +				dst + i * ctx->dma_buf_size,
> +				(uint64_t)tmp_buf, ctx->dma_buf_size);
> +			if (ret)
> +				goto free_buf;
> +		}
> +
> +		if (count_left > 0) {
> +			ret = dma_fpga_to_host(ctx, (uint64_t)tmp_buf,
> +				src + offset, count_left);
> +			if (ret)
> +				goto free_buf;
> +			ret = dma_host_to_fpga(ctx, dst + offset,
> +				(uint64_t)tmp_buf, count_left);
> +			if (ret)
> +				goto free_buf;
> +		}
> +free_buf:
> +		rte_free(tmp_buf);
> +	}
> +
> +	return ret;
> +}
> +
> +/*
> + * Run one transfer through the routine that matches its direction.
> + *
> + * Returns the result of dma_host_to_fpga(), dma_fpga_to_host() or
> + * dma_fpga_to_fpga(), or -EINVAL when ctx is NULL or type is none of the
> + * three known directions.
> + */
> +static int dma_transfer_sync(struct dma_afu_ctx *ctx, uint64_t dst,
> +	uint64_t src, size_t count, fpga_dma_type type)
> +{
> +	if (ctx == NULL)
> +		return -EINVAL;
> +
> +	switch (type) {
> +	case HOST_TO_FPGA:
> +		return dma_host_to_fpga(ctx, dst, src, count);
> +	case FPGA_TO_HOST:
> +		return dma_fpga_to_host(ctx, dst, src, count);
> +	case FPGA_TO_FPGA:
> +		return dma_fpga_to_fpga(ctx, dst, src, count);
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
> +/*
> + * Return the interval from start to end in seconds.
> + *
> + * The difference is formed in signed 64-bit nanoseconds so that it neither
> + * overflows where long is 32 bits wide nor wraps to a huge value when end
> + * precedes start (the result is then negative).
> + */
> +static double getTime(struct timespec start, struct timespec end)
> +{
> +	int64_t diff_ns = (int64_t)(end.tv_sec - start.tv_sec) * 1000000000LL
> +		+ (int64_t)(end.tv_nsec - start.tv_nsec);
> +
> +	return (double)diff_ns / 1000000000.0;
> +}
> +
> +#define SWEEP_ITERS 1
> +/*
> + * Time a host-to-FPGA transfer followed by the matching FPGA-to-host
> + * transfer, print the measured bandwidth of each, then verify the buffer.
> + *
> + * ctx:            DMA context; data_buf and ref_buf must be allocated
> + * length:         nominal test length in bytes
> + * ddr_offset:     FPGA memory offset used for the test
> + * buf_offset:     byte offset applied to ctx->data_buf
> + * size_decrement: bytes removed from the end of the transfer
> + *
> + * The number of bytes moved is length - (buf_offset + size_decrement).
> + * Returns -EINVAL on bad arguments, the transfer error if one fails, or
> + * the result of dma_afu_buf_verify().
> + */
> +static int sweep_test(struct dma_afu_ctx *ctx, uint32_t length,
> +	uint64_t ddr_offset, uint64_t buf_offset, uint64_t size_decrement)
> +{
> +	struct timespec t_begin, t_end;
> +	uint64_t trimmed = buf_offset + size_decrement;
> +	uint64_t nbytes = 0;
> +	uint64_t *host_buf = NULL;
> +	double elapsed = 0.0;
> +	double mbps;
> +	int pass = 0;
> +	int rc = 0;
> +
> +	if (!ctx || !ctx->data_buf || !ctx->ref_buf) {
> +		AFU_MF_PMD_ERR("Buffer for DMA test is not allocated");
> +		return -EINVAL;
> +	}
> +
> +	if (length < trimmed) {
> +		AFU_MF_PMD_ERR("Test length does not match unaligned parameter");
> +		return -EINVAL;
> +	}
> +
> +	nbytes = length - trimmed;
> +	if ((ddr_offset + nbytes) > ctx->mem_size) {
> +		AFU_MF_PMD_ERR("Test is out of DDR memory space");
> +		return -EINVAL;
> +	}
> +
> +	host_buf = (uint64_t *)((uint8_t *)ctx->data_buf + buf_offset);
> +
> +	/* host -> FPGA */
> +	printf("Sweep Host %p to FPGA 0x%"PRIx64
> +		" with 0x%"PRIx64" bytes ...\n",
> +		(void *)host_buf, ddr_offset, nbytes);
> +
> +	pass = 0;
> +	while (pass < SWEEP_ITERS) {
> +		clock_gettime(CLOCK_MONOTONIC, &t_begin);
> +		rc = dma_transfer_sync(ctx, ddr_offset, (uint64_t)host_buf,
> +			nbytes, HOST_TO_FPGA);
> +		clock_gettime(CLOCK_MONOTONIC, &t_end);
> +		if (rc) {
> +			AFU_MF_PMD_ERR("Failed");
> +			return rc;
> +		}
> +		elapsed += getTime(t_begin, t_end);
> +		pass++;
> +	}
> +	mbps = (nbytes * SWEEP_ITERS) / (elapsed * 1000000);
> +	printf("Measured bandwidth = %lf MB/s\n", mbps);
> +
> +	/* FPGA -> host, into a zeroed buffer */
> +	printf("Sweep FPGA 0x%"PRIx64" to Host %p with 0x%"PRIx64
> +		" bytes ...\n",
> +		ddr_offset, (void *)host_buf, nbytes);
> +
> +	elapsed = 0.0;
> +	memset((char *)host_buf, 0, nbytes);
> +	pass = 0;
> +	while (pass < SWEEP_ITERS) {
> +		clock_gettime(CLOCK_MONOTONIC, &t_begin);
> +		rc = dma_transfer_sync(ctx, (uint64_t)host_buf, ddr_offset,
> +			nbytes, FPGA_TO_HOST);
> +		clock_gettime(CLOCK_MONOTONIC, &t_end);
> +		if (rc) {
> +			AFU_MF_PMD_ERR("Failed");
> +			return rc;
> +		}
> +		elapsed += getTime(t_begin, t_end);
> +		pass++;
> +	}
> +	mbps = (nbytes * SWEEP_ITERS) / (elapsed * 1000000);
> +	printf("Measured bandwidth = %lf MB/s\n", mbps);
> +
> +	printf("Verifying buffer ...\n");
> +	return dma_afu_buf_verify(ctx, nbytes);
> +}
> +
> +/*
> + * Run the DMA self test selected by the stored DMA configuration.
> + *
> + * Sequence: allocate and initialize the test buffers, enable the global
> + * interrupt, copy host -> FPGA and back and verify, copy FPGA -> FPGA to a
> + * second region (when one fits) and read that back and verify, then run
> + * the aligned sweep and, if cfg->unaligned is set, the unaligned sweeps.
> + * The interrupt is disabled and the buffers are freed on every exit path
> + * taken after the context has been selected.
> + *
> + * Returns 0 on success, -EINVAL/-ENOENT on bad device state, or the first
> + * error reported by a helper.
> + */
> +static int dma_afu_test(struct afu_mf_rawdev *dev)
> +{
> +	struct n3000_afu_priv *priv = NULL;
> +	struct dma_afu_ctx *ctx = NULL;
> +	struct rte_pmd_afu_dma_cfg *cfg = NULL;
> +	msgdma_ctrl ctrl;
> +	uint64_t offset = 0;
> +	uint32_t i = 0;
> +	int ret = 0;
> +
> +	if (!dev)
> +		return -EINVAL;
> +
> +	if (!dev->priv)
> +		return -ENOENT;
> +
> +	priv = (struct n3000_afu_priv *)dev->priv;
> +	cfg = &priv->dma_cfg;
> +	if (cfg->index >= NUM_N3000_DMA)
> +		return -EINVAL;
> +	ctx = &priv->dma_ctx[cfg->index];
> +
> +	ctx->pattern = (int)cfg->pattern;
> +	ctx->verbose = (int)cfg->verbose;
> +	ctx->dma_buf_size = cfg->size;
> +
> +	ret = dma_afu_buf_alloc(ctx, cfg);
> +	if (ret)
> +		goto free;
> +
> +	printf("Initialize test buffer\n");
> +	dma_afu_buf_init(ctx, cfg->length);
> +
> +	/* enable interrupt */
> +	ctrl.csr = 0;
> +	ctrl.global_intr_en_mask = 1;
> +	rte_write32(ctrl.csr, CSR_CONTROL(ctx->csr_addr));
> +
> +	printf("Host %p to FPGA 0x%x with 0x%x bytes\n", ctx->data_buf,
> +		cfg->offset, cfg->length);
> +	ret = dma_transfer_sync(ctx, cfg->offset, (uint64_t)ctx->data_buf,
> +		cfg->length, HOST_TO_FPGA);
> +	if (ret) {
> +		AFU_MF_PMD_ERR("Failed to transfer data from host to FPGA");
> +		goto end;
> +	}
> +	memset(ctx->data_buf, 0, cfg->length);
> +
> +	printf("FPGA 0x%x to Host %p with 0x%x bytes\n", cfg->offset,
> +		ctx->data_buf, cfg->length);
> +	ret = dma_transfer_sync(ctx, (uint64_t)ctx->data_buf, cfg->offset,
> +		cfg->length, FPGA_TO_HOST);
> +	if (ret) {
> +		AFU_MF_PMD_ERR("Failed to transfer data from FPGA to host");
> +		goto end;
> +	}
> +	ret = dma_afu_buf_verify(ctx, cfg->length);
> +	if (ret)
> +		goto end;
> +
> +	/*
> +	 * Pick a second region for the FPGA-to-FPGA copy: directly after the
> +	 * first one if it fits, else at 0 if the first one starts far enough
> +	 * in. The sum is formed in 64 bits: offset and length are printed with
> +	 * %x above, i.e. they are 32-bit values, and offset + 2 * length can
> +	 * exceed 32 bits when the memory is 4 GiB.
> +	 */
> +	if (((uint64_t)cfg->offset + (uint64_t)cfg->length * 2) <=
> +		ctx->mem_size)
> +		offset = (uint64_t)cfg->offset + cfg->length;
> +	else if (cfg->offset > cfg->length)
> +		offset = 0;
> +	else
> +		goto end;
> +
> +	printf("FPGA 0x%x to FPGA 0x%"PRIx64" with 0x%x bytes\n",
> +		cfg->offset, offset, cfg->length);
> +	ret = dma_transfer_sync(ctx, offset, cfg->offset, cfg->length,
> +		FPGA_TO_FPGA);
> +	if (ret) {
> +		AFU_MF_PMD_ERR("Failed to transfer data from FPGA to FPGA");
> +		goto end;
> +	}
> +
> +	printf("FPGA 0x%"PRIx64" to Host %p with 0x%x bytes\n", offset,
> +		ctx->data_buf, cfg->length);
> +	ret = dma_transfer_sync(ctx, (uint64_t)ctx->data_buf, offset,
> +		cfg->length, FPGA_TO_HOST);
> +	if (ret) {
> +		AFU_MF_PMD_ERR("Failed to transfer data from FPGA to host");
> +		goto end;
> +	}
> +	ret = dma_afu_buf_verify(ctx, cfg->length);
> +	if (ret)
> +		goto end;
> +
> +	printf("Sweep with aligned address and size\n");
> +	ret = sweep_test(ctx, cfg->length, cfg->offset, 0, 0);
> +	if (ret)
> +		goto end;
> +
> +	if (cfg->unaligned) {
> +		printf("Sweep with unaligned address and size\n");
> +		/* {host buffer offset, bytes trimmed from the end} pairs */
> +		struct unaligned_set {
> +			uint64_t addr_offset;
> +			uint64_t size_dec;
> +		} param[] = {{61, 5}, {3, 0}, {7, 3}, {0, 3}, {0, 61}, {0, 7}};
> +		for (i = 0; i < ARRAY_SIZE(param); i++) {
> +			ret = sweep_test(ctx, cfg->length, cfg->offset,
> +				param[i].addr_offset, param[i].size_dec);
> +			if (ret)
> +				break;
> +		}
> +	}
> +
> +end:
> +	/* disable interrupt */
> +	ctrl.global_intr_en_mask = 0;
> +	rte_write32(ctrl.csr, CSR_CONTROL(ctx->csr_addr));
> +
> +free:
> +	dma_afu_buf_free(ctx);
> +	return ret;
> +}
> +
> +/*
> + * Resolve the PCI device behind an AFU raw device.
> + *
> + * Walks dev->rawdev->device to the AFU device and from the AFU device's
> + * own rawdev->device to the PCI device. Returns NULL when any link in
> + * that chain is missing.
> + */
> +static struct rte_pci_device *n3000_afu_get_pci_dev(
> +	struct afu_mf_rawdev *dev)
> +{
> +	struct rte_afu_device *afu = NULL;
> +
> +	if (dev == NULL || dev->rawdev == NULL || dev->rawdev->device == NULL)
> +		return NULL;
> +
> +	afu = RTE_DEV_TO_AFU(dev->rawdev->device);
> +	if (afu->rawdev != NULL && afu->rawdev->device != NULL)
> +		return RTE_DEV_TO_PCI(afu->rawdev->device);
> +
> +	return NULL;
> +}
> +
> +#ifdef VFIO_PRESENT
> +/*
> + * Bind count eventfds to consecutive MSI-X vectors starting at vec_start
> + * by issuing VFIO_DEVICE_SET_IRQS on the device's VFIO fd.
> + *
> + * Returns 0 on success, -EINVAL on bad arguments (count must be in
> + * 1..MAX_MSIX_VEC), -ENODEV when the PCI device cannot be resolved,
> + * -ENOMEM when the request cannot be allocated, or the ioctl() result.
> + */
> +static int dma_afu_set_irqs(struct afu_mf_rawdev *dev, uint32_t vec_start,
> +	uint32_t count, int *efds)
> +{
> +	struct rte_pci_device *pci = NULL;
> +	struct vfio_irq_set *req = NULL;
> +	size_t fds_len = 0;
> +	size_t req_len = 0;
> +	int dev_fd = 0;
> +	int rc = 0;
> +
> +	if (dev == NULL || efds == NULL)
> +		return -EINVAL;
> +	if (count == 0 || count > MAX_MSIX_VEC)
> +		return -EINVAL;
> +
> +	pci = n3000_afu_get_pci_dev(dev);
> +	if (pci == NULL)
> +		return -ENODEV;
> +	dev_fd = rte_intr_dev_fd_get(pci->intr_handle);
> +
> +	/* request header followed by one eventfd per vector */
> +	fds_len = sizeof(*efds) * count;
> +	req_len = sizeof(*req) + fds_len;
> +	req = rte_zmalloc(NULL, req_len, 0);
> +	if (req == NULL)
> +		return -ENOMEM;
> +
> +	req->argsz = (uint32_t)req_len;
> +	req->flags = VFIO_IRQ_SET_DATA_EVENTFD |
> +		VFIO_IRQ_SET_ACTION_TRIGGER;
> +	req->index = VFIO_PCI_MSIX_IRQ_INDEX;
> +	req->start = vec_start;
> +	req->count = count;
> +	rte_memcpy(&req->data, efds, fds_len);
> +
> +	rc = ioctl(dev_fd, VFIO_DEVICE_SET_IRQS, req);
> +	if (rc)
> +		AFU_MF_PMD_ERR("Error enabling MSI-X interrupts\n");
> +
> +	rte_free(req);
> +	return rc;
> +}
> +#endif
> +
> +/*
> + * Locate the register block of the port this AFU belongs to.
> + *
> + * Reads the port attribute register for dev->port from BAR 0 and, when
> + * the port is implemented, returns the address formed from the BAR and
> + * offset encoded in that attribute. Returns NULL when the PCI device
> + * cannot be resolved, the port is not implemented, or the encoded BAR
> + * index is not below PCI_MAX_RESOURCE.
> + */
> +static void *n3000_afu_get_port_addr(struct afu_mf_rawdev *dev)
> +{
> +	struct rte_pci_device *pci = n3000_afu_get_pci_dev(dev);
> +	uint8_t *bar0 = NULL;
> +	uint64_t attr = 0;
> +	uint32_t bar_idx = 0;
> +
> +	if (pci == NULL)
> +		return NULL;
> +
> +	bar0 = (uint8_t *)pci->mem_resource[0].addr;
> +	attr = rte_read64(bar0 + PORT_ATTR_REG(dev->port));
> +	if (!PORT_IMPLEMENTED(attr)) {
> +		AFU_MF_PMD_INFO("FIU port %d is not implemented", dev->port);
> +		return NULL;
> +	}
> +
> +	bar_idx = PORT_BAR(attr);
> +	if (bar_idx >= PCI_MAX_RESOURCE) {
> +		AFU_MF_PMD_ERR("BAR index %u is out of limit", bar_idx);
> +		return NULL;
> +	}
> +
> +	return (uint8_t *)pci->mem_resource[bar_idx].addr + PORT_OFFSET(attr);
> +}
> +
> +/*
> + * Walk the port's feature header list looking for the private feature
> + * with ID PORT_FEATURE_UINT_ID and report its vector start and count.
> + *
> + * vec_start and vec_count are optional outputs; either may be NULL.
> + * Returns 0 when the feature is found, -ENOENT when the port address is
> + * unavailable or the list ends without a match.
> + */
> +static int n3000_afu_get_irq_capability(struct afu_mf_rawdev *dev,
> +	uint32_t *vec_start, uint32_t *vec_count)
> +{
> +	uint8_t *feature = (uint8_t *)n3000_afu_get_port_addr(dev);
> +	uint64_t dfh = 0;
> +	uint64_t cap = 0;
> +	uint64_t step = 0;
> +
> +	if (feature == NULL)
> +		return -ENOENT;
> +
> +	for (;;) {
> +		feature += step;
> +		dfh = rte_read64(feature);
> +
> +		if ((DFH_TYPE(dfh) == DFH_TYPE_PRIVATE) &&
> +			(DFH_FEATURE_ID(dfh) == PORT_FEATURE_UINT_ID)) {
> +			cap = rte_read64(feature + PORT_UINT_CAP_REG);
> +			if (vec_start != NULL)
> +				*vec_start = PORT_VEC_START(cap);
> +			if (vec_count != NULL)
> +				*vec_count = PORT_VEC_COUNT(cap);
> +			return 0;
> +		}
> +
> +		/* stop on an invalid next offset or on the end-of-list flag */
> +		step = DFH_NEXT_OFFSET(dfh);
> +		if (((step & 0xffff) == 0xffff) || (step == 0))
> +			break;
> +		if (DFH_EOL(dfh))
> +			break;
> +	}
> +
> +	return -ENOENT;
> +}
> +
> +/*
> + * Free the DSM, source and destination buffers of the NLB context and
> + * clear the pointers that referred to them (status_ptr included).
> + *
> + * Returns 0 on success, -EINVAL when dev is NULL, -ENOENT when the
> + * device has no private data.
> + */
> +static int nlb_afu_ctx_release(struct afu_mf_rawdev *dev)
> +{
> +	struct n3000_afu_priv *priv = NULL;
> +	struct nlb_afu_ctx *nlb = NULL;
> +
> +	if (dev == NULL)
> +		return -EINVAL;
> +	if (dev->priv == NULL)
> +		return -ENOENT;
> +
> +	priv = (struct n3000_afu_priv *)dev->priv;
> +	nlb = &priv->nlb_ctx;
> +
> +	rte_free(nlb->dsm_ptr);
> +	rte_free(nlb->src_ptr);
> +	rte_free(nlb->dest_ptr);
> +
> +	nlb->dsm_ptr = NULL;
> +	nlb->status_ptr = NULL;
> +	nlb->src_ptr = NULL;
> +	nlb->dest_ptr = NULL;
> +
> +	return 0;
> +}
> +
> +static int nlb_afu_ctx_init(struct afu_mf_rawdev *dev, uint8_t *addr)
> +{
> +	struct n3000_afu_priv *priv = NULL;
> +	struct nlb_afu_ctx *ctx = NULL;
> +	int ret = 0;
> +
> +	if (!dev || !addr)
> +		return -EINVAL;
> +
> +	priv = (struct n3000_afu_priv *)dev->priv;
> +	if (!priv)
> +		return -ENOENT;
> +
> +	ctx = &priv->nlb_ctx;
> +	ctx->addr = addr;
> +
> +	ctx->dsm_ptr = (uint8_t *)rte_zmalloc(NULL, DSM_SIZE,
> TEST_MEM_ALIGN);
> +	if (!ctx->dsm_ptr) {
> +		ret = -ENOMEM;
> +		goto release;
> +	}
> +	ctx->dsm_iova = rte_malloc_virt2iova(ctx->dsm_ptr);
> +	if (ctx->dsm_iova == RTE_BAD_IOVA) {
> +		ret = -ENOMEM;
> +		goto release;
> +	}
> +
> +	ctx->src_ptr = (uint8_t *)rte_zmalloc(NULL, NLB_BUF_SIZE,
> +		TEST_MEM_ALIGN);
> +	if (!ctx->src_ptr) {
> +		ret = -ENOMEM;
> +		goto release;
> +	}
> +	ctx->src_iova = rte_malloc_virt2iova(ctx->src_ptr);
> +	if (ctx->src_iova == RTE_BAD_IOVA) {
> +		ret = -ENOMEM;
> +		goto release;
> +	}
> +
> +	ctx->dest_ptr = (uint8_t *)rte_zmalloc(NULL, NLB_BUF_SIZE,
> +		TEST_MEM_ALIGN);
> +	if (!ctx->dest_ptr) {
> +		ret = -ENOMEM;
> +		goto release;
> +	}

Suppose the allocation of ctx->dest_ptr fails: will nlb_afu_ctx_release() still work correctly?

> +	ctx->dest_iova = rte_malloc_virt2iova(ctx->dest_ptr);
> +	if (ctx->dest_iova == RTE_BAD_IOVA) {
> +		ret = -ENOMEM;
> +		goto release;
> +	}
> +
> +	ctx->status_ptr = (struct nlb_dsm_status *)(ctx->dsm_ptr +
> DSM_STATUS);
> +	return 0;
> +
> +release:
> +	nlb_afu_ctx_release(dev);
> +	return ret;
> +}
> +
> +/*
> + * Release the resources of every DMA context of the device.
> + *
> + * The descriptor and magic buffers of all NUM_N3000_DMA contexts are
> + * freed, not only those of DMA0, because dma_afu_ctx_init() allocates
> + * them per context. The event fd is created once (for DMA0) and its value
> + * is copied into the other contexts, so it is closed exactly once.
> + *
> + * The private data is zero-allocated by n3000_afu_init(), so an event_fd
> + * of 0 means "never opened"; closing it would close stdin. The value is
> + * reset after closing so a second call does not close it again.
> + *
> + * Returns 0 on success, -EINVAL when dev is NULL, -ENOENT when the
> + * device has no private data.
> + */
> +static int dma_afu_ctx_release(struct afu_mf_rawdev *dev)
> +{
> +	struct n3000_afu_priv *priv = NULL;
> +	struct dma_afu_ctx *ctx = NULL;
> +	int event_fd = 0;
> +	int i = 0;
> +
> +	if (!dev)
> +		return -EINVAL;
> +
> +	priv = (struct n3000_afu_priv *)dev->priv;
> +	if (!priv)
> +		return -ENOENT;
> +
> +	event_fd = priv->dma_ctx[0].event_fd;
> +
> +	for (i = 0; i < NUM_N3000_DMA; i++) {
> +		ctx = &priv->dma_ctx[i];
> +
> +		rte_free(ctx->desc_buf);
> +		ctx->desc_buf = NULL;
> +
> +		rte_free(ctx->magic_buf);
> +		ctx->magic_buf = NULL;
> +
> +		ctx->event_fd = 0;
> +	}
> +
> +	if (event_fd > 0)
> +		close(event_fd);
> +
> +	return 0;
> +}
> +
> +static int dma_afu_ctx_init(struct afu_mf_rawdev *dev, int index, uint8_t
> *addr)
> +{
> +	struct n3000_afu_priv *priv = NULL;
> +	struct dma_afu_ctx *ctx = NULL;
> +	uint64_t mem_sz[] = {0x100000000, 0x100000000, 0x40000000,
> 0x1000000};
> +	static int efds[1] = {0};
> +	uint32_t vec_start = 0;
> +	int ret = 0;
> +
> +	if (!dev || (index < 0) || (index >= NUM_N3000_DMA) || !addr)
> +		return -EINVAL;
> +
> +	priv = (struct n3000_afu_priv *)dev->priv;
> +	if (!priv)
> +		return -ENOENT;
> +
> +	ctx = &priv->dma_ctx[index];
> +	ctx->index = index;
> +	ctx->addr = addr;
> +	ctx->csr_addr = addr + DMA_CSR;
> +	ctx->desc_addr = addr + DMA_DESC;
> +	ctx->ase_ctrl_addr = addr + DMA_ASE_CTRL;
> +	ctx->ase_data_addr = addr + DMA_ASE_DATA;
> +	ctx->mem_size = mem_sz[ctx->index];
> +	ctx->cur_ase_page = INVALID_ASE_PAGE;
> +	if (ctx->index == 0) {
> +		ret = n3000_afu_get_irq_capability(dev, &vec_start, NULL);
> +		if (ret)
> +			return ret;
> +
> +		efds[0] = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> +		if (efds[0] < 0) {
> +			AFU_MF_PMD_ERR("eventfd create failed");
> +			return -EBADF;
> +		}
> +#ifdef VFIO_PRESENT
> +		if (dma_afu_set_irqs(dev, vec_start, 1, efds))
> +			AFU_MF_PMD_ERR("DMA interrupt setup failed");
> +#endif
> +	}
> +	ctx->event_fd = efds[0];
> +
> +	ctx->desc_buf = (msgdma_ext_desc *)rte_zmalloc(NULL,
> +		sizeof(msgdma_ext_desc), DMA_ALIGN_BYTES);
> +	if (!ctx->desc_buf) {
> +		ret = -ENOMEM;
> +		goto release;
> +	}
> +
> +	ctx->magic_buf = (uint64_t *)rte_zmalloc(NULL, MAGIC_BUF_SIZE,
> +		TEST_MEM_ALIGN);
> +	if (!ctx->magic_buf) {
> +		ret = -ENOMEM;
> +		goto release;
> +	}

Suppose the allocation of ctx->magic_buf fails: will dma_afu_ctx_release() still work correctly?

> +	ctx->magic_iova = rte_malloc_virt2iova(ctx->magic_buf);
> +	if (ctx->magic_iova == RTE_BAD_IOVA) {
> +		ret = -ENOMEM;
> +		goto release;
> +	}
> +
> +	return 0;
> +
> +release:
> +	dma_afu_ctx_release(dev);
> +	return ret;
> +}
> +
> +/*
> + * Scan the feature header list that starts at dev->addr and initialize a
> + * context for every NLB0 AFU and DMA BBB found (matched by header type
> + * and UUID). At most NUM_N3000_DMA DMA blocks are accepted; priv->num_dma
> + * counts the ones initialized.
> + *
> + * Returns 0 when the scan completes, -EINVAL when dev is NULL, -ENOENT
> + * when the device has no private data, or the error of the context
> + * initializer that failed.
> + */
> +static int n3000_afu_ctx_init(struct afu_mf_rawdev *dev)
> +{
> +	struct n3000_afu_priv *priv = NULL;
> +	uint8_t *feature = NULL;
> +	uint64_t dfh = 0;
> +	uint64_t id_lo = 0;
> +	uint64_t id_hi = 0;
> +	uint64_t step = 0;
> +	int rc = 0;
> +
> +	if (dev == NULL)
> +		return -EINVAL;
> +	if (dev->priv == NULL)
> +		return -ENOENT;
> +	priv = (struct n3000_afu_priv *)dev->priv;
> +
> +	feature = (uint8_t *)dev->addr;
> +	for (;;) {
> +		feature += step;
> +		dfh = rte_read64(feature);
> +		id_lo = rte_read64(feature + DFH_UUID_L_OFFSET);
> +		id_hi = rte_read64(feature + DFH_UUID_H_OFFSET);
> +
> +		if ((DFH_TYPE(dfh) == DFH_TYPE_AFU) &&
> +			(id_lo == N3000_NLB0_UUID_L) &&
> +			(id_hi == N3000_NLB0_UUID_H)) {
> +			AFU_MF_PMD_INFO("AFU NLB0 found @ %p",
> +				(void *)feature);
> +			rc = nlb_afu_ctx_init(dev, feature);
> +			if (rc)
> +				return rc;
> +		} else if ((DFH_TYPE(dfh) == DFH_TYPE_BBB) &&
> +			(id_lo == N3000_DMA_UUID_L) &&
> +			(id_hi == N3000_DMA_UUID_H) &&
> +			(priv->num_dma < NUM_N3000_DMA)) {
> +			AFU_MF_PMD_INFO("AFU DMA%d found @ %p",
> +				priv->num_dma, (void *)feature);
> +			rc = dma_afu_ctx_init(dev, priv->num_dma, feature);
> +			if (rc)
> +				return rc;
> +			priv->num_dma++;
> +		} else {
> +			AFU_MF_PMD_DEBUG("DFH: type %"PRIu64
> +				", uuid %016"PRIx64"%016"PRIx64,
> +				DFH_TYPE(dfh), id_hi, id_lo);
> +		}
> +
> +		/* stop on an invalid next offset or on the end-of-list flag */
> +		step = DFH_NEXT_OFFSET(dfh);
> +		if (((step & 0xffff) == 0xffff) || (step == 0))
> +			break;
> +		if (DFH_EOL(dfh))
> +			break;
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * Driver init hook: make sure the zero-filled private data exists, then
> + * discover and initialize the NLB and DMA contexts.
> + *
> + * Returns -EINVAL when dev is NULL, -ENOMEM when the private data cannot
> + * be allocated, otherwise the result of n3000_afu_ctx_init().
> + */
> +static int n3000_afu_init(struct afu_mf_rawdev *dev)
> +{
> +	void *priv = NULL;
> +
> +	if (dev == NULL)
> +		return -EINVAL;
> +
> +	if (dev->priv == NULL) {
> +		priv = rte_zmalloc(NULL, sizeof(struct n3000_afu_priv), 0);
> +		if (priv == NULL)
> +			return -ENOMEM;
> +		dev->priv = priv;
> +	}
> +
> +	return n3000_afu_ctx_init(dev);
> +}
> +
> +/*
> + * Driver config hook: validate a struct rte_pmd_afu_n3000_cfg and store
> + * the NLB or DMA part of it in the private data for a later test run.
> + *
> + * dev:         AFU raw device; its private data must exist
> + * config:      pointer to a struct rte_pmd_afu_n3000_cfg
> + * config_size: must equal sizeof(struct rte_pmd_afu_n3000_cfg)
> + *
> + * For DMA3 the length is rounded down to a multiple of 64 in the caller's
> + * structure before it is stored.
> + *
> + * Returns 0 on success, -ENOENT when the device has no private data,
> + * -EINVAL for any invalid argument or out-of-range setting.
> + */
> +static int n3000_afu_config(struct afu_mf_rawdev *dev, void *config,
> +	size_t config_size)
> +{
> +	struct n3000_afu_priv *priv = NULL;
> +	struct rte_pmd_afu_n3000_cfg *cfg = NULL;
> +	int i = 0;
> +	uint64_t top = 0;
> +
> +	if (!dev || !config || !config_size)
> +		return -EINVAL;
> +
> +	priv = (struct n3000_afu_priv *)dev->priv;
> +	if (!priv)
> +		return -ENOENT;
> +
> +	if (config_size != sizeof(struct rte_pmd_afu_n3000_cfg))
> +		return -EINVAL;
> +
> +	cfg = (struct rte_pmd_afu_n3000_cfg *)config;
> +	if (cfg->type == RTE_PMD_AFU_N3000_NLB) {
> +		if (cfg->nlb_cfg.mode != NLB_MODE_LPBK)
> +			return -EINVAL;
> +		if ((cfg->nlb_cfg.read_vc > NLB_VC_RANDOM) ||
> +			(cfg->nlb_cfg.write_vc > NLB_VC_RANDOM))
> +			return -EINVAL;
> +		if (cfg->nlb_cfg.wrfence_vc > NLB_VC_VH1)
> +			return -EINVAL;
> +		if (cfg->nlb_cfg.cache_hint > NLB_RDLINE_MIXED)
> +			return -EINVAL;
> +		if (cfg->nlb_cfg.cache_policy > NLB_WRPUSH_I)
> +			return -EINVAL;
> +		if ((cfg->nlb_cfg.multi_cl != 1) &&
> +			(cfg->nlb_cfg.multi_cl != 2) &&
> +			(cfg->nlb_cfg.multi_cl != 4))
> +			return -EINVAL;
> +		if ((cfg->nlb_cfg.begin < MIN_CACHE_LINES) ||
> +			(cfg->nlb_cfg.begin > MAX_CACHE_LINES))
> +			return -EINVAL;
> +		if ((cfg->nlb_cfg.end < cfg->nlb_cfg.begin) ||
> +			(cfg->nlb_cfg.end > MAX_CACHE_LINES))
> +			return -EINVAL;
> +		rte_memcpy(&priv->nlb_cfg, &cfg->nlb_cfg,
> +			sizeof(struct rte_pmd_afu_nlb_cfg));
> +	} else if (cfg->type == RTE_PMD_AFU_N3000_DMA) {
> +		if (cfg->dma_cfg.index >= NUM_N3000_DMA)
> +			return -EINVAL;
> +		i = cfg->dma_cfg.index;
> +		if (cfg->dma_cfg.length > priv->dma_ctx[i].mem_size)
> +			return -EINVAL;
> +		if (cfg->dma_cfg.offset >= priv->dma_ctx[i].mem_size)
> +			return -EINVAL;
> +		/*
> +		 * Widen before adding: with fields narrower than 64 bits the
> +		 * sum could wrap and slip past the range check below.
> +		 */
> +		top = (uint64_t)cfg->dma_cfg.length +
> +			(uint64_t)cfg->dma_cfg.offset;
> +		if ((top == 0) || (top > priv->dma_ctx[i].mem_size))
> +			return -EINVAL;
> +		if (i == 3) {  /* QDR connected to DMA3 */
> +			if (cfg->dma_cfg.length & 0x3f) {
> +				cfg->dma_cfg.length &= ~0x3f;
> +				AFU_MF_PMD_INFO("Round size to %x for QDR",
> +					cfg->dma_cfg.length);
> +			}
> +		}
> +		rte_memcpy(&priv->dma_cfg, &cfg->dma_cfg,
> +			sizeof(struct rte_pmd_afu_dma_cfg));
> +	} else {
> +		AFU_MF_PMD_ERR("Invalid type of N3000 AFU");
> +		return -EINVAL;
> +	}
> +
> +	priv->cfg_type = cfg->type;
> +	return 0;
> +}
> +
> +/*
> + * Driver test hook: run the NLB or the DMA test, whichever the last
> + * successful configuration selected.
> + *
> + * Returns -EINVAL when dev is NULL or no configuration has been stored,
> + * -ENOENT when the device has no private data, otherwise the result of
> + * the selected test.
> + */
> +static int n3000_afu_test(struct afu_mf_rawdev *dev)
> +{
> +	struct n3000_afu_priv *priv = NULL;
> +
> +	if (dev == NULL)
> +		return -EINVAL;
> +	if (dev->priv == NULL)
> +		return -ENOENT;
> +
> +	priv = (struct n3000_afu_priv *)dev->priv;
> +
> +	if (priv->cfg_type == RTE_PMD_AFU_N3000_NLB) {
> +		AFU_MF_PMD_INFO("Test NLB");
> +		return nlb_afu_test(dev);
> +	}
> +
> +	if (priv->cfg_type == RTE_PMD_AFU_N3000_DMA) {
> +		AFU_MF_PMD_INFO("Test DMA%u", priv->dma_cfg.index);
> +		return dma_afu_test(dev);
> +	}
> +
> +	AFU_MF_PMD_ERR("Please configure AFU before test");
> +	return -EINVAL;
> +}
> +
> +/*
> + * Driver close hook: release the NLB and DMA contexts, then free the
> + * private data and clear dev->priv.
> + *
> + * Returns 0, or -EINVAL when dev is NULL. The results of the two release
> + * helpers are not propagated.
> + */
> +static int n3000_afu_close(struct afu_mf_rawdev *dev)
> +{
> +	if (dev == NULL)
> +		return -EINVAL;
> +
> +	/* both helpers read dev->priv, so they must run before it is freed */
> +	(void)nlb_afu_ctx_release(dev);
> +	(void)dma_afu_ctx_release(dev);
> +
> +	rte_free(dev->priv);
> +	dev->priv = NULL;
> +
> +	return 0;
> +}
> +
> +/*
> + * Driver dump hook: print the addresses held by the currently configured
> + * context (NLB, or the DMA context selected by dma_cfg.index) to f.
> + * stdout is used when f is NULL.
> + *
> + * Returns 0 on success, -EINVAL when dev is NULL or nothing has been
> + * configured, -ENOENT when the device has no private data.
> + */
> +static int n3000_afu_dump(struct afu_mf_rawdev *dev, FILE *f)
> +{
> +	struct n3000_afu_priv *priv = NULL;
> +	FILE *out = f;
> +
> +	if (dev == NULL)
> +		return -EINVAL;
> +	if (dev->priv == NULL)
> +		return -ENOENT;
> +	priv = (struct n3000_afu_priv *)dev->priv;
> +
> +	if (out == NULL)
> +		out = stdout;
> +
> +	if (priv->cfg_type == RTE_PMD_AFU_N3000_NLB) {
> +		struct nlb_afu_ctx *nlb = &priv->nlb_ctx;
> +
> +		fprintf(out, "addr:\t\t%p\n", (void *)nlb->addr);
> +		fprintf(out, "dsm_ptr:\t%p\n", (void *)nlb->dsm_ptr);
> +		fprintf(out, "dsm_iova:\t0x%"PRIx64"\n", nlb->dsm_iova);
> +		fprintf(out, "src_ptr:\t%p\n", (void *)nlb->src_ptr);
> +		fprintf(out, "src_iova:\t0x%"PRIx64"\n", nlb->src_iova);
> +		fprintf(out, "dest_ptr:\t%p\n", (void *)nlb->dest_ptr);
> +		fprintf(out, "dest_iova:\t0x%"PRIx64"\n", nlb->dest_iova);
> +		fprintf(out, "status_ptr:\t%p\n", (void *)nlb->status_ptr);
> +		return 0;
> +	}
> +
> +	if (priv->cfg_type == RTE_PMD_AFU_N3000_DMA) {
> +		struct dma_afu_ctx *dma =
> +			&priv->dma_ctx[priv->dma_cfg.index];
> +
> +		fprintf(out, "index:\t\t%d\n", dma->index);
> +		fprintf(out, "addr:\t\t%p\n", (void *)dma->addr);
> +		fprintf(out, "csr_addr:\t%p\n", (void *)dma->csr_addr);
> +		fprintf(out, "desc_addr:\t%p\n", (void *)dma->desc_addr);
> +		fprintf(out, "ase_ctrl_addr:\t%p\n",
> +			(void *)dma->ase_ctrl_addr);
> +		fprintf(out, "ase_data_addr:\t%p\n",
> +			(void *)dma->ase_data_addr);
> +		fprintf(out, "desc_buf:\t%p\n", (void *)dma->desc_buf);
> +		fprintf(out, "magic_buf:\t%p\n", (void *)dma->magic_buf);
> +		fprintf(out, "magic_iova:\t0x%"PRIx64"\n", dma->magic_iova);
> +		return 0;
> +	}
> +
> +	return -EINVAL;
> +}
> +
> +/*
> + * Driver reset hook: pulse the soft-reset bit of the port control
> + * register (set, wait 100 us, clear).
> + *
> + * Returns 0 on success, -ENOENT when the port address is unavailable.
> + */
> +static int n3000_afu_reset(struct afu_mf_rawdev *dev)
> +{
> +	uint8_t *port = (uint8_t *)n3000_afu_get_port_addr(dev);
> +	uint64_t ctrl = 0;
> +
> +	if (port == NULL)
> +		return -ENOENT;
> +
> +	ctrl = rte_read64(port + PORT_CTRL_REG);
> +
> +	/* assert reset, hold it, then release with the other bits unchanged */
> +	rte_write64(ctrl | PORT_SOFT_RESET, port + PORT_CTRL_REG);
> +	rte_delay_us(100);
> +	rte_write64(ctrl & ~PORT_SOFT_RESET, port + PORT_CTRL_REG);
> +
> +	return 0;
> +}
> +
> +/*
> + * Operation table of the N3000 AFU driver. start and stop are not
> + * implemented; every other hook maps to the n3000_afu_* function of the
> + * same name in this file.
> + */
> +static struct afu_mf_ops n3000_afu_ops = {
> +	.init = n3000_afu_init,
> +	.config = n3000_afu_config,
> +	.start = NULL,
> +	.stop = NULL,
> +	.test = n3000_afu_test,
> +	.close = n3000_afu_close,
> +	.dump = n3000_afu_dump,
> +	.reset = n3000_afu_reset
> +};
> +
> +/*
> + * Driver descriptor exported through n3000_afu.h: the N3000 AFU UUID
> + * (low word first, then high word) paired with the operation table above.
> + */
> +struct afu_mf_drv n3000_afu_drv = {
> +	.uuid = { N3000_AFU_UUID_L, N3000_AFU_UUID_H },
> +	.ops = &n3000_afu_ops
> +};
> diff --git a/drivers/raw/afu_mf/n3000_afu.h b/drivers/raw/afu_mf/n3000_afu.h
> new file mode 100644
> index 0000000..4c740da
> --- /dev/null
> +++ b/drivers/raw/afu_mf/n3000_afu.h
> @@ -0,0 +1,333 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2022 Intel Corporation
> + */
> +
> +#ifndef _N3000_AFU_H_
> +#define _N3000_AFU_H_
> +
> +#include "afu_mf_rawdev.h"
> +#include "rte_pmd_afu.h"
> +
> +#define N3000_AFU_UUID_L  0xc000c9660d824272
> +#define N3000_AFU_UUID_H  0x9aeffe5f84570612
> +#define N3000_NLB0_UUID_L 0xf89e433683f9040b
> +#define N3000_NLB0_UUID_H 0xd8424dc4a4a3c413
> +#define N3000_DMA_UUID_L  0xa9149a35bace01ea
> +#define N3000_DMA_UUID_H  0xef82def7f6ec40fc
> +
> +extern struct afu_mf_drv n3000_afu_drv;
> +
> +#define NUM_N3000_DMA  4
> +#define MAX_MSIX_VEC   7
> +
> +/*
> + * N3000 DFL definition
> + *
> + * Accessors for the 64-bit device feature header (DFH) and for the port
> + * registers used by this driver. All accessor macros take the raw 64-bit
> + * register value.
> + */
> +#define DFH_UUID_L_OFFSET  8
> +#define DFH_UUID_H_OFFSET  16
> +/* DFH bits 63:60 - feature type */
> +#define DFH_TYPE(hdr)  (((hdr) >> 60) & 0xf)
> +#define DFH_TYPE_AFU  1
> +#define DFH_TYPE_BBB  2
> +#define DFH_TYPE_PRIVATE  3
> +/* DFH bit 40 - end of list */
> +#define DFH_EOL(hdr)  (((hdr) >> 40) & 0x1)
> +/* DFH bits 39:16 - byte offset to the next header */
> +#define DFH_NEXT_OFFSET(hdr)  (((hdr) >> 16) & 0xffffff)
> +/* DFH bits 11:0 - feature ID */
> +#define DFH_FEATURE_ID(hdr)  ((hdr) & 0xfff)
> +/* port attribute registers are 8 bytes apart, starting at 0x38 */
> +#define PORT_ATTR_REG(n)  (((n) << 3) + 0x38)
> +#define PORT_IMPLEMENTED(attr)  (((attr) >> 60) & 0x1)
> +#define PORT_BAR(attr)  (((attr) >> 32) & 0x7)
> +#define PORT_OFFSET(attr)  ((attr) & 0xffffff)
> +#define PORT_FEATURE_UINT_ID  0x12
> +#define PORT_UINT_CAP_REG  0x8
> +/* user interrupt capability: bits 23:12 first vector, bits 11:0 count */
> +#define PORT_VEC_START(cap)  (((cap) >> 12) & 0xfff)
> +#define PORT_VEC_COUNT(cap)  ((cap) & 0xfff)
> +#define PORT_CTRL_REG  0x38
> +#define PORT_SOFT_RESET  (0x1 << 0)
> +
> +/* NLB registers definition */
> +/* byte offsets of the NLB control/status registers */
> +#define CSR_SCRATCHPAD0    0x100
> +#define CSR_SCRATCHPAD1    0x108
> +#define CSR_AFU_DSM_BASEL  0x110
> +#define CSR_AFU_DSM_BASEH  0x114
> +#define CSR_SRC_ADDR       0x120
> +#define CSR_DST_ADDR       0x128
> +#define CSR_NUM_LINES      0x130
> +#define CSR_CTL            0x138
> +#define CSR_CFG            0x140
> +#define CSR_INACT_THRESH   0x148
> +#define CSR_INTERRUPT0     0x150
> +#define CSR_SWTEST_MSG     0x158
> +#define CSR_STATUS0        0x160
> +#define CSR_STATUS1        0x168
> +#define CSR_ERROR          0x170
> +#define CSR_STRIDE         0x178
> +#define CSR_HE_INFO0       0x180
> +
> +/* DSM buffer: allocation size and offset of the status record inside it */
> +#define DSM_SIZE           0x200000
> +#define DSM_STATUS         0x40
> +#define DSM_POLL_INTERVAL  5  /* ms */
> +#define DSM_TIMEOUT        1000  /* ms */
> +
> +/* size of the NLB source/destination buffers and alignment of test memory */
> +#define NLB_BUF_SIZE  0x400000
> +#define TEST_MEM_ALIGN  1024
> +
> +/*
> + * 32-bit register image that can be accessed whole through csr or by
> + * bit-field; the fields add up to 32 bits.
> + * NOTE(review): the name suggests this is the layout of CSR_CTL - confirm.
> + */
> +struct nlb_csr_ctl {
> +	union {
> +		uint32_t csr;
> +		struct {
> +			uint32_t reset:1;
> +			uint32_t start:1;
> +			uint32_t force_completion:1;
> +			uint32_t reserved:29;
> +		};
> +	};
> +};
> +
> +/*
> + * 32-bit register image that can be accessed whole through csr or by
> + * bit-field; the fields (reserved ones included) add up to 32 bits.
> + * NOTE(review): the name suggests this is the layout of CSR_CFG - confirm.
> + */
> +struct nlb_csr_cfg {
> +	union {
> +		uint32_t csr;
> +		struct {
> +			uint32_t wrthru_en:1;
> +			uint32_t cont:1;
> +			uint32_t mode:3;
> +			uint32_t multicl_len:2;
> +			uint32_t rsvd1:1;
> +			uint32_t delay_en:1;
> +			uint32_t rdsel:2;
> +			uint32_t rsvd2:1;
> +			uint32_t chsel:3;
> +			uint32_t rsvd3:1;
> +			uint32_t wrpush_i:1;
> +			uint32_t wr_chsel:3;
> +			uint32_t rsvd4:3;
> +			uint32_t test_cfg:5;
> +			uint32_t interrupt_on_error:1;
> +			uint32_t interrupt_testmode:1;
> +			uint32_t wrfence_chsel:2;
> +		};
> +	};
> +};
> +
> +/*
> + * 64-bit register image: csr is the whole value, the two 32-bit counters
> + * alias its halves (num_writes first in memory).
> + * NOTE(review): presumably the layout of CSR_STATUS0 - confirm.
> + */
> +struct nlb_status0 {
> +	union {
> +		uint64_t csr;
> +		struct {
> +			uint32_t num_writes;
> +			uint32_t num_reads;
> +		};
> +	};
> +};
> +
> +/*
> + * 64-bit register image: csr is the whole value, the two 32-bit counters
> + * alias its halves (num_pend_writes first in memory).
> + * NOTE(review): presumably the layout of CSR_STATUS1 - confirm.
> + */
> +struct nlb_status1 {
> +	union {
> +		uint64_t csr;
> +		struct {
> +			uint32_t num_pend_writes;
> +			uint32_t num_pend_reads;
> +		};
> +	};
> +};
> +
> +/*
> + * Status record found at offset DSM_STATUS inside the DSM buffer
> + * (nlb_afu_ctx_init() points status_ptr at dsm_ptr + DSM_STATUS).
> + * NOTE(review): who writes the fields and their units are not visible
> + * here - confirm against the NLB documentation.
> + */
> +struct nlb_dsm_status {
> +	uint32_t test_complete;
> +	uint32_t test_error;
> +	uint64_t num_clocks;
> +	uint32_t num_reads;
> +	uint32_t num_writes;
> +	uint32_t start_overhead;
> +	uint32_t end_overhead;
> +};
> +
> +/* DMA registers definition */
> +/* byte offsets from the DMA block base (see dma_afu_ctx_init()) */
> +#define DMA_CSR       0x40
> +#define DMA_DESC      0x60
> +#define DMA_ASE_CTRL  0x200
> +#define DMA_ASE_DATA  0x1000
> +
> +/* ASE window size in bytes, the mask of an offset within it, and the
> + * value stored in cur_ase_page when no page is selected
> + */
> +#define DMA_ASE_WINDOW       4096
> +#define DMA_ASE_WINDOW_MASK  ((uint64_t)(DMA_ASE_WINDOW - 1))
> +#define INVALID_ASE_PAGE     0xffffffffffffffffULL
> +
> +/* the magic value's bytes spell "Wrt_Sync" in ASCII */
> +#define DMA_WF_MAGIC             0x5772745F53796E63ULL
> +#define DMA_WF_MAGIC_ROM         0x1000000000000
> +/* tag an address by setting bit 49, or bits 49 and 48 */
> +#define DMA_HOST_ADDR(addr)      ((addr) | 0x2000000000000)
> +#define DMA_WF_HOST_ADDR(addr)   ((addr) | 0x3000000000000)
> +
> +/* number of host bounce buffers, and half of that */
> +#define NUM_DMA_BUF   8
> +#define HALF_DMA_BUF  (NUM_DMA_BUF / 2)
> +
> +#define DMA_MASK_32_BIT 0xFFFFFFFF
> +
> +#define DMA_CSR_BUSY           0x1
> +#define DMA_DESC_BUFFER_EMPTY  0x2
> +#define DMA_DESC_BUFFER_FULL   0x4
> +
> +#define DWORD_BYTES 4
> +#define IS_ALIGNED_DWORD(addr) (((addr) % DWORD_BYTES) == 0)
> +
> +#define QWORD_BYTES 8
> +#define IS_ALIGNED_QWORD(addr) (((addr) % QWORD_BYTES) == 0)
> +
> +/* addresses and lengths handed to the DMA engine are multiples of this */
> +#define DMA_ALIGN_BYTES 64
> +#define IS_DMA_ALIGNED(addr) (((addr) % DMA_ALIGN_BYTES) == 0)
> +
> +/* 64 << 2 = 256 bytes */
> +#define CCIP_ALIGN_BYTES (DMA_ALIGN_BYTES << 2)
> +
> +#define DMA_TIMEOUT_MSEC  5000
> +
> +#define MAGIC_BUF_SIZE  64
> +#define ERR_CHECK_LIMIT  64
> +
> +/* both helpers evaluate their arguments more than once */
> +#ifndef MIN
> +#define MIN(a, b) ((a) < (b) ? (a) : (b))
> +#endif
> +
> +#ifndef ARRAY_SIZE
> +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
> +#endif
> +
> +typedef enum {
> +	HOST_TO_FPGA = 0,
> +	FPGA_TO_HOST,
> +	FPGA_TO_FPGA,
> +	FPGA_MAX_TRANSFER_TYPE,
> +} fpga_dma_type;
> +
> +typedef union {
> +	uint32_t csr;
> +	struct {
> +		uint32_t tx_channel:8;
> +		uint32_t generate_sop:1;
> +		uint32_t generate_eop:1;
> +		uint32_t park_reads:1;
> +		uint32_t park_writes:1;
> +		uint32_t end_on_eop:1;
> +		uint32_t reserved_1:1;
> +		uint32_t transfer_irq_en:1;
> +		uint32_t early_term_irq_en:1;
> +		uint32_t trans_error_irq_en:8;
> +		uint32_t early_done_en:1;
> +		uint32_t reserved_2:6;
> +		uint32_t go:1;
> +	};
> +} msgdma_desc_ctrl;
> +
> +typedef struct __rte_packed {
> +	uint32_t rd_address;
> +	uint32_t wr_address;
> +	uint32_t len;
> +	uint16_t seq_num;
> +	uint8_t rd_burst_count;
> +	uint8_t wr_burst_count;
> +	uint16_t rd_stride;
> +	uint16_t wr_stride;
> +	uint32_t rd_address_ext;
> +	uint32_t wr_address_ext;
> +	msgdma_desc_ctrl control;
> +} msgdma_ext_desc;
> +
> +typedef union {
> +	uint32_t csr;
> +	struct {
> +		uint32_t busy:1;
> +		uint32_t desc_buf_empty:1;
> +		uint32_t desc_buf_full:1;
> +		uint32_t rsp_buf_empty:1;
> +		uint32_t rsp_buf_full:1;
> +		uint32_t stopped:1;
> +		uint32_t resetting:1;
> +		uint32_t stopped_on_error:1;
> +		uint32_t stopped_on_early_term:1;
> +		uint32_t irq:1;
> +		uint32_t reserved:22;
> +	};
> +} msgdma_status;
> +
> +typedef union {
> +	uint32_t csr;
> +	struct {
> +		uint32_t stop_dispatcher:1;
> +		uint32_t reset_dispatcher:1;
> +		uint32_t stop_on_error:1;
> +		uint32_t stopped_on_early_term:1;
> +		uint32_t global_intr_en_mask:1;
> +		uint32_t stop_descriptors:1;
> +		uint32_t reserved:22;
> +	};
> +} msgdma_ctrl;
> +
> +typedef union {
> +	uint32_t csr;
> +	struct {
> +		uint32_t rd_fill_level:16;
> +		uint32_t wr_fill_level:16;
> +	};
> +} msgdma_fill_level;
> +
> +typedef union {
> +	uint32_t csr;
> +	struct {
> +		uint32_t rsp_fill_level:16;
> +		uint32_t reserved:16;
> +	};
> +} msgdma_rsp_level;
> +
> +typedef union {
> +	uint32_t csr;
> +	struct {
> +		uint32_t rd_seq_num:16;
> +		uint32_t wr_seq_num:16;
> +	};
> +} msgdma_seq_num;
> +
> +typedef struct __rte_packed {
> +	msgdma_status status;
> +	msgdma_ctrl ctrl;
> +	msgdma_fill_level fill_level;
> +	msgdma_rsp_level rsp;
> +	msgdma_seq_num seq_num;
> +} msgdma_csr;
> +
> +#define CSR_STATUS(csr)   (&(((msgdma_csr *)(csr))->status))
> +#define CSR_CONTROL(csr)  (&(((msgdma_csr *)(csr))->ctrl))
> +
> +struct nlb_afu_ctx {
> +	uint8_t *addr;
> +	uint8_t *dsm_ptr;
> +	uint64_t dsm_iova;
> +	uint8_t *src_ptr;
> +	uint64_t src_iova;
> +	uint8_t *dest_ptr;
> +	uint64_t dest_iova;
> +	struct nlb_dsm_status *status_ptr;
> +};
> +
> +struct dma_afu_ctx {
> +	int index;
> +	uint8_t *addr;
> +	uint8_t *csr_addr;
> +	uint8_t *desc_addr;
> +	uint8_t *ase_ctrl_addr;
> +	uint8_t *ase_data_addr;
> +	uint64_t mem_size;
> +	uint64_t cur_ase_page;
> +	int event_fd;
> +	int verbose;
> +	int pattern;
> +	void *data_buf;
> +	void *ref_buf;
> +	msgdma_ext_desc *desc_buf;
> +	uint64_t *magic_buf;
> +	uint64_t magic_iova;
> +	uint32_t dma_buf_size;
> +	uint64_t *dma_buf[NUM_DMA_BUF];
> +	uint64_t dma_iova[NUM_DMA_BUF];
> +};
> +
> +struct n3000_afu_priv {
> +	struct rte_pmd_afu_nlb_cfg nlb_cfg;
> +	struct rte_pmd_afu_dma_cfg dma_cfg;
> +	struct nlb_afu_ctx nlb_ctx;
> +	struct dma_afu_ctx dma_ctx[NUM_N3000_DMA];
> +	int num_dma;
> +	int cfg_type;
> +};
> +
> +#endif /* _N3000_AFU_H_ */
> diff --git a/drivers/raw/afu_mf/rte_pmd_afu.h
> b/drivers/raw/afu_mf/rte_pmd_afu.h
> new file mode 100644
> index 0000000..f14a053
> --- /dev/null
> +++ b/drivers/raw/afu_mf/rte_pmd_afu.h
> @@ -0,0 +1,97 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2022 Intel Corporation
> + */
> +
> +#ifndef __RTE_PMD_AFU_H__
> +#define __RTE_PMD_AFU_H__
> +
> +/**
> + * @file rte_pmd_afu.h
> + *
> + * AFU PMD specific definitions.
> + *
> + * @b EXPERIMENTAL: this API may change, or be removed, without prior
> notice
> + *
> + */
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <stdint.h>
> +
> +#define RTE_PMD_AFU_N3000_NLB   1
> +#define RTE_PMD_AFU_N3000_DMA   2
> +
> +#define NLB_MODE_LPBK      0
> +#define NLB_MODE_READ      1
> +#define NLB_MODE_WRITE     2
> +#define NLB_MODE_TRPUT     3
> +
> +#define NLB_VC_AUTO        0
> +#define NLB_VC_VL0         1
> +#define NLB_VC_VH0         2
> +#define NLB_VC_VH1         3
> +#define NLB_VC_RANDOM      4
> +
> +#define NLB_WRLINE_M       0
> +#define NLB_WRLINE_I       1
> +#define NLB_WRPUSH_I       2
> +
> +#define NLB_RDLINE_S       0
> +#define NLB_RDLINE_I       1
> +#define NLB_RDLINE_MIXED   2
> +
> +#define MIN_CACHE_LINES   1
> +#define MAX_CACHE_LINES   1024
> +
> +#define MIN_DMA_BUF_SIZE  64
> +#define MAX_DMA_BUF_SIZE  (1023 * 1024)
> +
> +/**
> + * NLB AFU configuration data structure.
> + */
> +struct rte_pmd_afu_nlb_cfg {
> +	uint32_t mode;
> +	uint32_t begin;
> +	uint32_t end;
> +	uint32_t multi_cl;
> +	uint32_t cont;
> +	uint32_t timeout;
> +	uint32_t cache_policy;
> +	uint32_t cache_hint;
> +	uint32_t read_vc;
> +	uint32_t write_vc;
> +	uint32_t wrfence_vc;
> +	uint32_t freq_mhz;
> +};
> +
> +/**
> + * DMA AFU configuration data structure.
> + */
> +struct rte_pmd_afu_dma_cfg {
> +	uint32_t index;     /* index of DMA controller */
> +	uint32_t length;    /* total length of data to DMA */
> +	uint32_t offset;    /* address offset of target memory */
> +	uint32_t size;      /* size of transfer buffer */
> +	uint32_t pattern;   /* data pattern to fill in test buffer */
> +	uint32_t unaligned; /* use unaligned address or length in sweep test */
> +	uint32_t verbose;   /* enable verbose error information in test */
> +};
> +
> +/**
> + * N3000 AFU configuration data structure.
> + */
> +struct rte_pmd_afu_n3000_cfg {
> +	int type;   /* RTE_PMD_AFU_N3000_NLB or
> RTE_PMD_AFU_N3000_DMA */
> +	union {
> +		struct rte_pmd_afu_nlb_cfg nlb_cfg;
> +		struct rte_pmd_afu_dma_cfg dma_cfg;
> +	};
> +};
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* __RTE_PMD_AFU_H__ */
> --
> 1.8.3.1
  
Wei Huang June 7, 2022, 2:40 a.m. UTC | #2
> -----Original Message-----
> From: Zhang, Tianfei <tianfei.zhang@intel.com>
> Sent: Monday, June 6, 2022 09:39
> To: Huang, Wei <wei.huang@intel.com>; dev@dpdk.org;
> thomas@monjalon.net; nipun.gupta@nxp.com; hemant.agrawal@nxp.com
> Cc: stable@dpdk.org; Xu, Rosen <rosen.xu@intel.com>; Zhang, Qi Z
> <qi.z.zhang@intel.com>
> Subject: RE: [PATCH v5 2/5] raw/afu_mf: add N3000 AFU driver
> 
> 
> 
> > -----Original Message-----
> > From: Huang, Wei <wei.huang@intel.com>
> > Sent: Friday, May 27, 2022 1:37 PM
> > To: dev@dpdk.org; thomas@monjalon.net; nipun.gupta@nxp.com;
> > hemant.agrawal@nxp.com
> > Cc: stable@dpdk.org; Xu, Rosen <rosen.xu@intel.com>; Zhang, Tianfei
> > <tianfei.zhang@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Huang,
> Wei
> > <wei.huang@intel.com>
> > Subject: [PATCH v5 2/5] raw/afu_mf: add N3000 AFU driver
> >
> > N3000 AFU includes NLB0 and DMA modules, NLB0 is used to test PCI bus
> > and DMA is used to test local memory.
> > This driver initialize the modules and report test result.
> >
> > Signed-off-by: Wei Huang <wei.huang@intel.com>
> > ---
> >  drivers/raw/afu_mf/afu_mf_rawdev.c |    4 +
> >  drivers/raw/afu_mf/afu_mf_rawdev.h |   18 +
> >  drivers/raw/afu_mf/meson.build     |    4 +-
> >  drivers/raw/afu_mf/n3000_afu.c     | 2005
> > ++++++++++++++++++++++++++++++++++++
> >  drivers/raw/afu_mf/n3000_afu.h     |  333 ++++++
> >  drivers/raw/afu_mf/rte_pmd_afu.h   |   97 ++
> >  6 files changed, 2460 insertions(+), 1 deletion(-)
> >  create mode 100644 drivers/raw/afu_mf/n3000_afu.c
> >  create mode 100644 drivers/raw/afu_mf/n3000_afu.h
> >  create mode 100644 drivers/raw/afu_mf/rte_pmd_afu.h
> >
> > diff --git a/drivers/raw/afu_mf/afu_mf_rawdev.c
> > b/drivers/raw/afu_mf/afu_mf_rawdev.c
> > index 5be372a..7c18f3b 100644
> > --- a/drivers/raw/afu_mf/afu_mf_rawdev.c
> > +++ b/drivers/raw/afu_mf/afu_mf_rawdev.c
> > @@ -17,15 +17,19 @@
> >  #include <rte_memzone.h>
> >  #include <rte_rawdev_pmd.h>
> >
> > +#include "rte_pmd_afu.h"
> >  #include "afu_mf_rawdev.h"
> > +#include "n3000_afu.h"
> >
> >  #define AFU_MF_PMD_RAWDEV_NAME rawdev_afu_mf
> >
> >  static const struct rte_afu_uuid afu_uuid_map[] = {
> > +	{ N3000_AFU_UUID_L, N3000_AFU_UUID_H },
> >  	{ 0, 0 /* sentinel */ }
> >  };
> >
> >  static struct afu_mf_drv *afu_table[] = {
> > +	&n3000_afu_drv,
> >  	NULL
> >  };
> >
> > diff --git a/drivers/raw/afu_mf/afu_mf_rawdev.h
> > b/drivers/raw/afu_mf/afu_mf_rawdev.h
> > index df6715c..5a66f6c 100644
> > --- a/drivers/raw/afu_mf/afu_mf_rawdev.h
> > +++ b/drivers/raw/afu_mf/afu_mf_rawdev.h
> > @@ -30,6 +30,24 @@
> >  #define AFU_MF_PMD_WARN(fmt, args...) \
> >  	AFU_MF_PMD_LOG(WARNING, fmt, ## args)
> >
> > +#define CLS_TO_SIZE(n)  ((n) << 6)  /* get size of n cache lines */
> > +#define SIZE_TO_CLS(s)  ((s) >> 6)  /* convert size to number of cache
> lines */
> > +#define MHZ(f)  ((f) * 1000000)
> > +
> > +#define dsm_poll_timeout(addr, val, cond, invl, timeout) \
> > +({                                                       \
> > +	uint64_t __wait = 0;                                 \
> > +	uint64_t __invl = (invl);                            \
> > +	uint64_t __timeout = (timeout);                      \
> > +	for (; __wait <= __timeout; __wait += __invl) {      \
> > +		(val) = *(addr);                                 \
> > +		if (cond)                                        \
> > +			break;                                       \
> > +		rte_delay_ms(__invl);                            \
> > +	}                                                    \
> > +	(cond) ? 0 : 1;                                      \
> > +})
> 
> Does "DSM" mean DMA?
DSM means 'DMA Status Memory'
> 
> > +
> >  struct afu_mf_rawdev;
> >
> >  struct afu_mf_ops {
> > diff --git a/drivers/raw/afu_mf/meson.build
> b/drivers/raw/afu_mf/meson.build
> > index 80526a2..8a989e3 100644
> > --- a/drivers/raw/afu_mf/meson.build
> > +++ b/drivers/raw/afu_mf/meson.build
> > @@ -2,4 +2,6 @@
> >  # Copyright 2022 Intel Corporation
> >
> >  deps += ['rawdev', 'bus_pci', 'bus_ifpga']
> > -sources = files('afu_mf_rawdev.c')
> > +sources = files('afu_mf_rawdev.c', 'n3000_afu.c')
> > +
> > +headers = files('rte_pmd_afu.h')
> > diff --git a/drivers/raw/afu_mf/n3000_afu.c
> b/drivers/raw/afu_mf/n3000_afu.c
> > new file mode 100644
> > index 0000000..19d7c54
> > --- /dev/null
> > +++ b/drivers/raw/afu_mf/n3000_afu.c
> > @@ -0,0 +1,2005 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2022 Intel Corporation
> > + */
> > +
> > +#include <errno.h>
> > +#include <stdio.h>
> > +#include <stdint.h>
> > +#include <stdlib.h>
> > +#include <inttypes.h>
> > +#include <unistd.h>
> > +#include <fcntl.h>
> > +#include <poll.h>
> > +#include <sys/eventfd.h>
> > +#include <sys/ioctl.h>
> > +
> > +#include <rte_eal.h>
> > +#include <rte_malloc.h>
> > +#include <rte_memcpy.h>
> > +#include <rte_io.h>
> > +#include <rte_vfio.h>
> > +#include <rte_bus_pci.h>
> > +#include <rte_bus_ifpga.h>
> > +#include <rte_rawdev.h>
> > +
> > +#include "afu_mf_rawdev.h"
> > +#include "n3000_afu.h"
> > +
> > +static int nlb_afu_config(struct afu_mf_rawdev *dev)
> > +{
> > +	struct n3000_afu_priv *priv = NULL;
> > +	struct rte_pmd_afu_nlb_cfg *cfg = NULL;
> > +	struct nlb_csr_cfg v;
> > +
> > +	if (!dev)
> > +		return -EINVAL;
> > +
> > +	if (!dev->priv)
> > +		return -ENOENT;
> > +
> > +	priv = (struct n3000_afu_priv *)dev->priv;
> > +	cfg = &priv->nlb_cfg;
> > +
> > +	v.csr = 0;
> > +
> > +	if (cfg->cont)
> > +		v.cont = 1;
> > +
> > +	if (cfg->cache_policy == NLB_WRPUSH_I)
> > +		v.wrpush_i = 1;
> > +	else
> > +		v.wrthru_en = cfg->cache_policy;
> > +
> > +	if (cfg->cache_hint == NLB_RDLINE_MIXED)
> > +		v.rdsel = 3;
> > +	else
> > +		v.rdsel = cfg->cache_hint;
> > +
> > +	v.mode = cfg->mode;
> > +	v.chsel = cfg->read_vc;
> > +	v.wr_chsel = cfg->write_vc;
> > +	v.wrfence_chsel = cfg->wrfence_vc;
> > +	v.wrthru_en = cfg->cache_policy;
> > +	v.multicl_len = cfg->multi_cl - 1;
> > +
> > +	AFU_MF_PMD_DEBUG("cfg: 0x%08x", v.csr);
> > +	rte_write32(v.csr, priv->nlb_ctx.addr + CSR_CFG);
> > +
> > +	return 0;
> > +}
> > +
> > +static void nlb_afu_report(struct afu_mf_rawdev *dev, uint32_t cl)
> > +{
> > +	struct n3000_afu_priv *priv = NULL;
> > +	struct rte_pmd_afu_nlb_cfg *cfg = NULL;
> > +	struct nlb_dsm_status *stat = NULL;
> > +	uint64_t ticks = 0;
> > +	double num, rd_bw, wr_bw;
> > +
> > +	if (!dev || !dev->priv)
> > +		return;
> > +
> > +	priv = (struct n3000_afu_priv *)dev->priv;
> > +
> > +	cfg = &priv->nlb_cfg;
> > +	stat = priv->nlb_ctx.status_ptr;
> > +
> > +	if (cfg->cont)
> > +		ticks = stat->num_clocks - stat->start_overhead;
> > +	else
> > +		ticks = stat->num_clocks -
> > +			(stat->start_overhead + stat->end_overhead);
> > +
> > +	if (cfg->freq_mhz == 0)
> > +		cfg->freq_mhz = 200;
> > +
> > +	num = (double)stat->num_reads;
> > +	rd_bw = (num * CLS_TO_SIZE(1) * MHZ(cfg->freq_mhz)) / ticks;
> > +	num = (double)stat->num_writes;
> > +	wr_bw = (num * CLS_TO_SIZE(1) * MHZ(cfg->freq_mhz)) / ticks;
> > +
> > +	printf("Cachelines  Read_Count Write_Count Clocks@%uMHz   "
> > +		"Rd_Bandwidth   Wr_Bandwidth\n", cfg->freq_mhz);
> > +	printf("%10u  %10u %11u  %12"PRIu64"   %7.3f GB/s   %7.3f GB/s\n",
> > +		cl, stat->num_reads, stat->num_writes, ticks,
> > +		rd_bw / 1e9, wr_bw / 1e9);
> > +}
> > +
> > +static int nlb_afu_test(struct afu_mf_rawdev *dev)
> > +{
> > +	struct n3000_afu_priv *priv = NULL;
> > +	struct nlb_afu_ctx *ctx = NULL;
> > +	struct rte_pmd_afu_nlb_cfg *cfg = NULL;
> > +	struct nlb_csr_ctl ctl;
> > +	uint32_t *ptr = NULL;
> > +	uint32_t i, j, cl, val = 0;
> > +	uint64_t sval = 0;
> > +	int ret = 0;
> > +
> > +	if (!dev)
> > +		return -EINVAL;
> > +
> > +	if (!dev->priv)
> > +		return -ENOENT;
> > +
> > +	priv = (struct n3000_afu_priv *)dev->priv;
> > +	ctx = &priv->nlb_ctx;
> > +	cfg = &priv->nlb_cfg;
> > +
> > +	/* initialize registers */
> > +	AFU_MF_PMD_DEBUG("dsm_addr: 0x%"PRIx64, ctx->dsm_iova);
> > +	rte_write64(ctx->dsm_iova, ctx->addr + CSR_AFU_DSM_BASEL);
> > +
> > +	ctl.csr = 0;
> > +	rte_write32(ctl.csr, ctx->addr + CSR_CTL);
> > +	ctl.reset = 1;
> > +	rte_write32(ctl.csr, ctx->addr + CSR_CTL);
> > +
> > +	AFU_MF_PMD_DEBUG("src_addr: 0x%"PRIx64, ctx->src_iova);
> > +	rte_write64(SIZE_TO_CLS(ctx->src_iova), ctx->addr +
> CSR_SRC_ADDR);
> > +	AFU_MF_PMD_DEBUG("dst_addr: 0x%"PRIx64, ctx->dest_iova);
> > +	rte_write64(SIZE_TO_CLS(ctx->dest_iova), ctx->addr +
> CSR_DST_ADDR);
> > +
> > +	ret = nlb_afu_config(dev);
> > +	if (ret)
> > +		return ret;
> > +
> > +	/* initialize src data */
> > +	ptr = (uint32_t *)ctx->src_ptr;
> > +	j = CLS_TO_SIZE(cfg->end) >> 2;
> > +	for (i = 0; i < j; i++)
> > +		*ptr++ = i;
> > +
> > +	/* start test */
> > +	for (cl = cfg->begin; cl <= cfg->end; cl += cfg->multi_cl) {
> > +		memset(ctx->dest_ptr, 0, CLS_TO_SIZE(cl));
> > +		memset(ctx->dsm_ptr, 0, DSM_SIZE);
> > +
> > +		ctl.csr = 0;
> > +		rte_write32(ctl.csr, ctx->addr + CSR_CTL);
> > +		ctl.reset = 1;
> > +		rte_write32(ctl.csr, ctx->addr + CSR_CTL);
> > +
> > +		rte_write32(cl, ctx->addr + CSR_NUM_LINES);
> > +
> > +		rte_delay_us(10);
> > +
> > +		ctl.start = 1;
> > +		rte_write32(ctl.csr, ctx->addr + CSR_CTL);
> > +
> > +		if (cfg->cont) {
> > +			rte_delay_ms(cfg->timeout * 1000);
> > +			ctl.force_completion = 1;
> > +			rte_write32(ctl.csr, ctx->addr + CSR_CTL);
> > +			ret = dsm_poll_timeout(&ctx->status_ptr-
> > >test_complete,
> > +				val, (val & 0x1) == 1, DSM_POLL_INTERVAL,
> > +				DSM_TIMEOUT);
> > +			if (ret) {
> > +				printf("DSM poll timeout\n");
> > +				goto end;
> > +			}
> > +		} else {
> > +			ret = dsm_poll_timeout(&ctx->status_ptr-
> > >test_complete,
> > +				val, (val & 0x1) == 1, DSM_POLL_INTERVAL,
> > +				DSM_TIMEOUT);
> > +			if (ret) {
> > +				printf("DSM poll timeout\n");
> > +				goto end;
> > +			}
> > +			ctl.force_completion = 1;
> > +			rte_write32(ctl.csr, ctx->addr + CSR_CTL);
> > +		}
> > +
> > +		nlb_afu_report(dev, cl);
> > +
> > +		i = 0;
> > +		while (i++ < 100) {
> > +			sval = rte_read64(ctx->addr + CSR_STATUS1);
> > +			if (sval == 0)
> > +				break;
> > +			rte_delay_us(1000);
> > +		}
> > +
> > +		ptr = (uint32_t *)ctx->dest_ptr;
> > +		j = CLS_TO_SIZE(cl) >> 2;
> > +		for (i = 0; i < j; i++) {
> > +			if (*ptr++ != i) {
> > +				AFU_MF_PMD_ERR("Data mismatch @ %u",
> i);
> > +				break;
> > +			}
> > +		}
> > +	}
> > +
> > +end:
> > +	return ret;
> > +}
> > +
> > +static void dma_afu_buf_free(struct dma_afu_ctx *ctx)
> > +{
> > +	int i = 0;
> > +
> > +	if (!ctx)
> > +		return;
> > +
> > +	for (i = 0; i < NUM_DMA_BUF; i++) {
> > +		rte_free(ctx->dma_buf[i]);
> > +		ctx->dma_buf[i] = NULL;
> > +	}
> > +
> > +	rte_free(ctx->data_buf);
> > +	ctx->data_buf = NULL;
> > +
> > +	rte_free(ctx->ref_buf);
> > +	ctx->ref_buf = NULL;
> > +}
> > +
> > +static int dma_afu_buf_alloc(struct dma_afu_ctx *ctx,
> > +	struct rte_pmd_afu_dma_cfg *cfg)
> > +{
> > +	size_t page_sz = sysconf(_SC_PAGE_SIZE);
> > +	int i, ret = 0;
> > +
> > +	if (!ctx || !cfg)
> > +		return -EINVAL;
> > +
> > +	for (i = 0; i < NUM_DMA_BUF; i++) {
> > +		ctx->dma_buf[i] = (uint64_t *)rte_zmalloc(NULL, cfg->size,
> > +			TEST_MEM_ALIGN);
> > +		if (!ctx->dma_buf[i]) {
> > +			ret = -ENOMEM;
> > +			goto free;
> > +		}
> > +		ctx->dma_iova[i] = rte_malloc_virt2iova(ctx->dma_buf[i]);
> > +		if (ctx->dma_iova[i] == RTE_BAD_IOVA) {
> > +			ret = -ENOMEM;
> > +			goto free;
> > +		}
> > +	}
> > +
> > +	ctx->data_buf = rte_malloc(NULL, cfg->length, page_sz);
> > +	if (!ctx->data_buf) {
> > +		ret = -ENOMEM;
> > +		goto free;
> > +	}
> > +
> > +	ctx->ref_buf = rte_malloc(NULL, cfg->length, page_sz);
> > +	if (!ctx->ref_buf) {
> > +		ret = -ENOMEM;
> > +		goto free;
> > +	}
> 
> Suppose the ctx->ref_buf allocation fails: will dma_afu_buf_free() still work
> correctly?
> 
dma_afu_buf_free() works correctly in the failure case, but this is not the standard error-handling pattern, so I will change it.
> > +
> > +	return 0;
> > +
> > +free:
> > +	dma_afu_buf_free(ctx);
> > +	return ret;
> > +}
> > +
> > +static void dma_afu_buf_init(struct dma_afu_ctx *ctx, size_t size)
> > +{
> > +	int *ptr = NULL;
> > +	size_t i = 0;
> > +	size_t dword_size = 0;
> > +
> > +	if (!ctx || !size)
> > +		return;
> > +
> > +	ptr = (int *)ctx->ref_buf;
> > +
> > +	if (ctx->pattern) {
> > +		memset(ptr, ctx->pattern, size);
> > +	} else {
> > +		srand(99);
> > +		dword_size = size >> 2;
> > +		for (i = 0; i < dword_size; i++)
> > +			*ptr++ = rand();
> > +	}
> > +	rte_memcpy(ctx->data_buf, ctx->ref_buf, size);
> > +}
> > +
> > +static int dma_afu_buf_verify(struct dma_afu_ctx *ctx, size_t size)
> > +{
> > +	uint8_t *src = NULL;
> > +	uint8_t *dst = NULL;
> > +	size_t i = 0;
> > +	int n = 0;
> > +
> > +	if (!ctx || !size)
> > +		return -EINVAL;
> > +
> > +	src = (uint8_t *)ctx->ref_buf;
> > +	dst = (uint8_t *)ctx->data_buf;
> > +
> > +	if (memcmp(src, dst, size)) {
> > +		printf("Transfer is corrupted\n");
> > +		if (ctx->verbose) {
> > +			for (i = 0; i < size; i++) {
> > +				if (*src != *dst) {
> > +					if (++n >= ERR_CHECK_LIMIT)
> > +						break;
> > +					printf("Mismatch at 0x%zx, "
> > +						"Expected %02x  Actual
> > %02x\n",
> > +						i, *src, *dst);
> > +				}
> > +				src++;
> > +				dst++;
> > +			}
> > +			if (n < ERR_CHECK_LIMIT) {
> > +				printf("Found %d error bytes\n", n);
> > +			} else {
> > +				printf("......\n");
> > +				printf("Found more than %d error bytes\n",
> n);
> > +			}
> > +		}
> > +		return -1;
> > +	}
> > +
> > +	printf("Transfer is verified\n");
> > +	return 0;
> > +}
> > +
> > +static void blk_write64(uint64_t *dev_addr, uint64_t *host_addr,
> uint64_t
> > bytes)
> > +{
> > +	uint64_t qwords = bytes / sizeof(uint64_t);
> > +
> > +	if (!IS_ALIGNED_QWORD((uint64_t)dev_addr) ||
> > +		!IS_ALIGNED_QWORD((uint64_t)bytes))
> > +		return;
> > +
> > +	for (; qwords > 0; qwords--, host_addr++, dev_addr++)
> > +		rte_write64(*host_addr, dev_addr);
> > +}
> > +
> > +static void blk_read64(uint64_t *dev_addr, uint64_t *host_addr, uint64_t
> > bytes)
> > +{
> > +	uint64_t qwords = bytes / sizeof(uint64_t);
> > +
> > +	if (!IS_ALIGNED_QWORD((uint64_t)dev_addr) ||
> > +		!IS_ALIGNED_QWORD((uint64_t)bytes))
> > +		return;
> > +
> > +	for (; qwords > 0; qwords--, host_addr++, dev_addr++)
> > +		*host_addr = rte_read64(dev_addr);
> > +}
> > +
> > +static void switch_ase_page(struct dma_afu_ctx *ctx, uint64_t addr)
> > +{
> > +	uint64_t requested_page = addr & ~DMA_ASE_WINDOW_MASK;
> > +
> > +	if (!ctx)
> > +		return;
> > +
> > +	if (requested_page != ctx->cur_ase_page) {
> > +		rte_write64(requested_page, ctx->ase_ctrl_addr);
> > +		ctx->cur_ase_page = requested_page;
> > +	}
> > +}
> > +
> > +static int ase_write_unaligned(struct dma_afu_ctx *ctx, uint64_t
> dev_addr,
> > +	uint64_t host_addr, uint32_t count)
> > +{
> > +	uint64_t dev_aligned_addr = 0;
> > +	uint64_t shift = 0;
> > +	uint64_t val = 0;
> > +	uintptr_t addr = (uintptr_t)host_addr;  /* transfer to pointer size */
> > +
> > +	AFU_MF_PMD_DEBUG("0x%"PRIx64" --> 0x%"PRIx64" (0x%x)",
> > host_addr,
> > +		dev_addr, count);
> > +
> > +	if (!ctx || (count >= QWORD_BYTES))
> > +		return -EINVAL;
> > +
> > +	if (!count)
> > +		return 0;
> > +
> > +	switch_ase_page(ctx, dev_addr);
> > +
> > +	shift = dev_addr % QWORD_BYTES;
> > +	dev_aligned_addr = (dev_addr - shift) & DMA_ASE_WINDOW_MASK;
> > +	val = rte_read64(ctx->ase_data_addr + dev_aligned_addr);
> > +	rte_memcpy(((char *)(&val)) + shift, (void *)addr, count);
> > +
> > +	/* write back to device */
> > +	rte_write64(val, ctx->ase_data_addr + dev_aligned_addr);
> > +
> > +	return 0;
> > +}
> > +
> > +static int ase_write(struct dma_afu_ctx *ctx, uint64_t *dst_ptr,
> > +	uint64_t *src_ptr, uint64_t *count)
> > +{
> > +	uint64_t src = *src_ptr;
> > +	uint64_t dst = *dst_ptr;
> > +	uint64_t align_bytes = *count;
> > +	uint64_t offset = 0;
> > +	uint64_t left_in_page = DMA_ASE_WINDOW;
> > +	uint64_t size_to_copy = 0;
> > +
> > +	AFU_MF_PMD_DEBUG("0x%"PRIx64" --> 0x%"PRIx64"
> (0x%"PRIx64")",
> > src, dst,
> > +		align_bytes);
> > +
> > +	if (!ctx || !IS_ALIGNED_DWORD(dst))
> > +		return -EINVAL;
> > +
> > +	if (align_bytes < DWORD_BYTES)
> > +		return 0;
> > +
> > +	if (!IS_ALIGNED_QWORD(dst)) {
> > +		/* Write out a single DWORD to get QWORD aligned */
> > +		switch_ase_page(ctx, dst);
> > +		offset = dst & DMA_ASE_WINDOW_MASK;
> > +
> > +		rte_write32(*(uint32_t *)(uintptr_t)src,
> > +			ctx->ase_data_addr + offset);
> > +		src += DWORD_BYTES;
> > +		dst += DWORD_BYTES;
> > +		align_bytes -= DWORD_BYTES;
> > +	}
> > +
> > +	if (!align_bytes)
> > +		return 0;
> > +
> > +	/* Write out blocks of 64-bit values */
> > +	while (align_bytes >= QWORD_BYTES) {
> > +		left_in_page -= dst & DMA_ASE_WINDOW_MASK;
> > +		size_to_copy =
> > +			MIN(left_in_page, (align_bytes & ~(QWORD_BYTES -
> > 1)));
> > +		if (size_to_copy < QWORD_BYTES)
> > +			break;
> > +		switch_ase_page(ctx, dst);
> > +		offset = dst & DMA_ASE_WINDOW_MASK;
> > +		blk_write64((uint64_t *)(ctx->ase_data_addr + offset),
> > +			(uint64_t *)(uintptr_t)src, size_to_copy);
> > +		src += size_to_copy;
> > +		dst += size_to_copy;
> > +		align_bytes -= size_to_copy;
> > +	}
> > +
> > +	if (align_bytes >= DWORD_BYTES) {
> > +		/* Write out remaining DWORD */
> > +		switch_ase_page(ctx, dst);
> > +		offset = dst & DMA_ASE_WINDOW_MASK;
> > +		rte_write32(*(uint32_t *)(uintptr_t)src,
> > +			ctx->ase_data_addr + offset);
> > +		src += DWORD_BYTES;
> > +		dst += DWORD_BYTES;
> > +		align_bytes -= DWORD_BYTES;
> > +	}
> > +
> > +	*src_ptr = src;
> > +	*dst_ptr = dst;
> > +	*count = align_bytes;
> > +
> > +	return 0;
> > +}
> > +
> > +static int ase_host_to_fpga(struct dma_afu_ctx *ctx, uint64_t *dst_ptr,
> > +	uint64_t *src_ptr, uint64_t count)
> > +{
> > +	uint64_t dst = *dst_ptr;
> > +	uint64_t src = *src_ptr;
> > +	uint64_t count_left = count;
> > +	uint64_t unaligned_size = 0;
> > +	int ret = 0;
> > +
> > +	AFU_MF_PMD_DEBUG("0x%"PRIx64" --> 0x%"PRIx64"
> (0x%"PRIx64")",
> > src, dst,
> > +		count);
> > +
> > +	/* aligns address to 8 byte using dst masking method */
> > +	if (!IS_ALIGNED_DWORD(dst) && !IS_ALIGNED_QWORD(dst)) {
> > +		unaligned_size = QWORD_BYTES - (dst % QWORD_BYTES);
> > +		if (unaligned_size > count_left)
> > +			unaligned_size = count_left;
> > +		ret = ase_write_unaligned(ctx, dst, src, unaligned_size);
> > +		if (ret)
> > +			return ret;
> > +		count_left -= unaligned_size;
> > +		src += unaligned_size;
> > +		dst += unaligned_size;
> > +	}
> > +
> > +	/* Handles 8/4 byte MMIO transfer */
> > +	ret = ase_write(ctx, &dst, &src, &count_left);
> > +	if (ret)
> > +		return ret;
> > +
> > +	/* Left over unaligned bytes transferred using dst masking method
> */
> > +	unaligned_size = QWORD_BYTES - (dst % QWORD_BYTES);
> > +	if (unaligned_size > count_left)
> > +		unaligned_size = count_left;
> > +
> > +	ret = ase_write_unaligned(ctx, dst, src, unaligned_size);
> > +	if (ret)
> > +		return ret;
> > +
> > +	count_left -= unaligned_size;
> > +	*dst_ptr = dst + unaligned_size;
> > +	*src_ptr = src + unaligned_size;
> > +
> > +	return 0;
> > +}
> > +
> > +static int ase_read_unaligned(struct dma_afu_ctx *ctx, uint64_t
> dev_addr,
> > +	uint64_t host_addr, uint32_t count)
> > +{
> > +	uint64_t dev_aligned_addr = 0;
> > +	uint64_t shift = 0;
> > +	uint64_t val = 0;
> > +	uintptr_t addr = (uintptr_t)host_addr;  /* transfer to pointer size */
> > +
> > +	AFU_MF_PMD_DEBUG("0x%"PRIx64" <-- 0x%"PRIx64" (0x%x)",
> > host_addr,
> > +		dev_addr, count);
> > +
> > +	if (!ctx || (count >= QWORD_BYTES))
> > +		return -EINVAL;
> > +
> > +	if (!count)
> > +		return 0;
> > +
> > +	switch_ase_page(ctx, dev_addr);
> > +
> > +	shift = dev_addr % QWORD_BYTES;
> > +	dev_aligned_addr = (dev_addr - shift) & DMA_ASE_WINDOW_MASK;
> > +	val = rte_read64(ctx->ase_data_addr + dev_aligned_addr);
> > +	rte_memcpy((void *)addr, ((char *)(&val)) + shift, count);
> > +
> > +	return 0;
> > +}
> > +
> > +static int ase_read(struct dma_afu_ctx *ctx, uint64_t *src_ptr,
> > +	uint64_t *dst_ptr, uint64_t *count)
> > +{
> > +	uint64_t src = *src_ptr;
> > +	uint64_t dst = *dst_ptr;
> > +	uint64_t align_bytes = *count;
> > +	uint64_t offset = 0;
> > +	uint64_t left_in_page = DMA_ASE_WINDOW;
> > +	uint64_t size_to_copy = 0;
> > +
> > +	AFU_MF_PMD_DEBUG("0x%"PRIx64" <-- 0x%"PRIx64"
> (0x%"PRIx64")",
> > dst, src,
> > +		align_bytes);
> > +
> > +	if (!ctx || !IS_ALIGNED_DWORD(src))
> > +		return -EINVAL;
> > +
> > +	if (align_bytes < DWORD_BYTES)
> > +		return 0;
> > +
> > +	if (!IS_ALIGNED_QWORD(src)) {
> > +		/* Read a single DWORD to get QWORD aligned */
> > +		switch_ase_page(ctx, src);
> > +		offset = src & DMA_ASE_WINDOW_MASK;
> > +		*(uint32_t *)(uintptr_t)dst =
> > +			rte_read32(ctx->ase_data_addr + offset);
> > +		src += DWORD_BYTES;
> > +		dst += DWORD_BYTES;
> > +		align_bytes -= DWORD_BYTES;
> > +	}
> > +
> > +	if (!align_bytes)
> > +		return 0;
> > +
> > +	/* Read blocks of 64-bit values */
> > +	while (align_bytes >= QWORD_BYTES) {
> > +		left_in_page -= src & DMA_ASE_WINDOW_MASK;
> > +		size_to_copy =
> > +			MIN(left_in_page, (align_bytes & ~(QWORD_BYTES -
> > 1)));
> > +		if (size_to_copy < QWORD_BYTES)
> > +			break;
> > +		switch_ase_page(ctx, src);
> > +		offset = src & DMA_ASE_WINDOW_MASK;
> > +		blk_read64((uint64_t *)(ctx->ase_data_addr + offset),
> > +			(uint64_t *)(uintptr_t)dst, size_to_copy);
> > +		src += size_to_copy;
> > +		dst += size_to_copy;
> > +		align_bytes -= size_to_copy;
> > +	}
> > +
> > +	if (align_bytes >= DWORD_BYTES) {
> > +		/* Read remaining DWORD */
> > +		switch_ase_page(ctx, src);
> > +		offset = src & DMA_ASE_WINDOW_MASK;
> > +		*(uint32_t *)(uintptr_t)dst =
> > +			rte_read32(ctx->ase_data_addr + offset);
> > +		src += DWORD_BYTES;
> > +		dst += DWORD_BYTES;
> > +		align_bytes -= DWORD_BYTES;
> > +	}
> > +
> > +	*src_ptr = src;
> > +	*dst_ptr = dst;
> > +	*count = align_bytes;
> > +
> > +	return 0;
> > +}
> > +
> > +static int ase_fpga_to_host(struct dma_afu_ctx *ctx, uint64_t *src_ptr,
> > +	uint64_t *dst_ptr, uint64_t count)
> > +{
> > +	uint64_t src = *src_ptr;
> > +	uint64_t dst = *dst_ptr;
> > +	uint64_t count_left = count;
> > +	uint64_t unaligned_size = 0;
> > +	int ret = 0;
> > +
> > +	AFU_MF_PMD_DEBUG("0x%"PRIx64" --> 0x%"PRIx64"
> (0x%"PRIx64")",
> > src, dst,
> > +		count);
> > +
> > +	/* Aligns address to 8 byte using src masking method */
> > +	if (!IS_ALIGNED_DWORD(src) && !IS_ALIGNED_QWORD(src)) {
> > +		unaligned_size = QWORD_BYTES - (src % QWORD_BYTES);
> > +		if (unaligned_size > count_left)
> > +			unaligned_size = count_left;
> > +		ret = ase_read_unaligned(ctx, src, dst, unaligned_size);
> > +		if (ret)
> > +			return ret;
> > +		count_left -= unaligned_size;
> > +		dst += unaligned_size;
> > +		src += unaligned_size;
> > +	}
> > +
> > +	/* Handles 8/4 byte MMIO transfer */
> > +	ret = ase_read(ctx, &src, &dst, &count_left);
> > +	if (ret)
> > +		return ret;
> > +
> > +	/* Left over unaligned bytes transferred using src masking method */
> > +	unaligned_size = QWORD_BYTES - (src % QWORD_BYTES);
> > +	if (unaligned_size > count_left)
> > +		unaligned_size = count_left;
> > +
> > +	ret = ase_read_unaligned(ctx, src, dst, unaligned_size);
> > +	if (ret)
> > +		return ret;
> > +
> > +	count_left -= unaligned_size;
> > +	*dst_ptr = dst + unaligned_size;
> > +	*src_ptr = src + unaligned_size;
> > +
> > +	return 0;
> > +}
> > +
> > +static void clear_interrupt(struct dma_afu_ctx *ctx)
> > +{
> > +	/* clear interrupt by writing 1 to IRQ bit in status register */
> > +	msgdma_status status;
> > +
> > +	if (!ctx)
> > +		return;
> > +
> > +	status.csr = 0;
> > +	status.irq = 1;
> > +	rte_write32(status.csr, CSR_STATUS(ctx->csr_addr));
> > +}
> > +
> > +static int poll_interrupt(struct dma_afu_ctx *ctx)
> > +{
> > +	struct pollfd pfd = {0};
> > +	uint64_t count = 0;
> > +	ssize_t bytes_read = 0;
> > +	int poll_ret = 0;
> > +	int ret = 0;
> > +
> > +	if (!ctx || (ctx->event_fd < 0))
> > +		return -EINVAL;
> > +
> > +	pfd.fd = ctx->event_fd;
> > +	pfd.events = POLLIN;
> > +	poll_ret = poll(&pfd, 1, DMA_TIMEOUT_MSEC);
> > +	if (poll_ret < 0) {
> > +		AFU_MF_PMD_ERR("Error %s", strerror(errno));
> > +		ret = -EFAULT;
> > +		goto out;
> > +	} else if (poll_ret == 0) {
> > +		AFU_MF_PMD_ERR("Timeout");
> > +		ret = -ETIMEDOUT;
> > +	} else {
> > +		bytes_read = read(pfd.fd, &count, sizeof(count));
> > +		if (bytes_read > 0) {
> > +			if (ctx->verbose)
> > +				AFU_MF_PMD_DEBUG("Successful, ret %d,
> cnt
> > %"PRIu64,
> > +					poll_ret, count);
> > +			ret = 0;
> > +		} else {
> > +			AFU_MF_PMD_ERR("Failed %s", bytes_read > 0 ?
> > +				strerror(errno) : "zero bytes read");
> > +			ret = -EIO;
> > +		}
> > +	}
> > +out:
> > +	clear_interrupt(ctx);
> > +	return ret;
> > +}
> > +
> > +static void send_descriptor(struct dma_afu_ctx *ctx, msgdma_ext_desc *desc)
> > +{
> > +	msgdma_status status;
> > +	uint64_t fpga_queue_full = 0;
> > +
> > +	if (!ctx)
> > +		return;
> > +
> > +	if (ctx->verbose) {
> > +		AFU_MF_PMD_DEBUG("descriptor.rd_address = 0x%x%08x",
> > +			desc->rd_address_ext, desc->rd_address);
> > +		AFU_MF_PMD_DEBUG("descriptor.wr_address = 0x%x%08x",
> > +			desc->wr_address_ext, desc->wr_address);
> > +		AFU_MF_PMD_DEBUG("descriptor.len = %u", desc->len);
> > +		AFU_MF_PMD_DEBUG("descriptor.wr_burst_count = %u",
> > +			desc->wr_burst_count);
> > +		AFU_MF_PMD_DEBUG("descriptor.rd_burst_count = %u",
> > +			desc->rd_burst_count);
> > +		AFU_MF_PMD_DEBUG("descriptor.wr_stride %u", desc->wr_stride);
> > +		AFU_MF_PMD_DEBUG("descriptor.rd_stride %u", desc->rd_stride);
> > +	}
> > +
> > +	do {
> > +		status.csr = rte_read32(CSR_STATUS(ctx->csr_addr));
> > +		if (fpga_queue_full++ > 100000000) {
> > +			AFU_MF_PMD_DEBUG("DMA queue full retry");
> > +			fpga_queue_full = 0;
> > +		}
> > +	} while (status.desc_buf_full);
> > +
> > +	blk_write64((uint64_t *)ctx->desc_addr, (uint64_t *)desc,
> > +		sizeof(*desc));
> > +}
> > +
> > +static int do_dma(struct dma_afu_ctx *ctx, uint64_t dst, uint64_t src,
> > +	int count, int is_last_desc, fpga_dma_type type, int intr_en)
> > +{
> > +	msgdma_ext_desc *desc = NULL;
> > +	int alignment_offset = 0;
> > +	int segment_size = 0;
> > +
> > +	if (!ctx)
> > +		return -EINVAL;
> > +
> > +	/* src, dst and count must be 64-byte aligned */
> > +	if (!IS_DMA_ALIGNED(src) || !IS_DMA_ALIGNED(dst) ||
> > +		!IS_DMA_ALIGNED(count))
> > +		return -EINVAL;
> > +	memset(ctx->desc_buf, 0, sizeof(msgdma_ext_desc));
> > +
> > +	/* these fields are fixed for all DMA transfers */
> > +	desc = ctx->desc_buf;
> > +	desc->seq_num = 0;
> > +	desc->wr_stride = 1;
> > +	desc->rd_stride = 1;
> > +	desc->control.go = 1;
> > +	if (intr_en)
> > +		desc->control.transfer_irq_en = 1;
> > +	else
> > +		desc->control.transfer_irq_en = 0;
> > +
> > +	if (!is_last_desc)
> > +		desc->control.early_done_en = 1;
> > +	else
> > +		desc->control.early_done_en = 0;
> > +
> > +	if (type == FPGA_TO_FPGA) {
> > +		desc->rd_address = src & DMA_MASK_32_BIT;
> > +		desc->wr_address = dst & DMA_MASK_32_BIT;
> > +		desc->len = count;
> > +		desc->wr_burst_count = 4;
> > +		desc->rd_burst_count = 4;
> > +		desc->rd_address_ext = (src >> 32) & DMA_MASK_32_BIT;
> > +		desc->wr_address_ext = (dst >> 32) & DMA_MASK_32_BIT;
> > +		send_descriptor(ctx, desc);
> > +	} else {
> > +		/* check CCIP (host) address is aligned to 4CL (256B) */
> > +		alignment_offset = (type == HOST_TO_FPGA)
> > +			? (src % CCIP_ALIGN_BYTES) : (dst % CCIP_ALIGN_BYTES);
> > +		/* performing a short transfer to get aligned */
> > +		if (alignment_offset != 0) {
> > +			desc->rd_address = src & DMA_MASK_32_BIT;
> > +			desc->wr_address = dst & DMA_MASK_32_BIT;
> > +			desc->wr_burst_count = 1;
> > +			desc->rd_burst_count = 1;
> > +			desc->rd_address_ext = (src >> 32) & DMA_MASK_32_BIT;
> > +			desc->wr_address_ext = (dst >> 32) & DMA_MASK_32_BIT;
> > +			/* count isn't large enough to hit next 4CL boundary */
> > +			if ((CCIP_ALIGN_BYTES - alignment_offset) >= count) {
> > +				segment_size = count;
> > +				count = 0;
> > +			} else {
> > +				segment_size = CCIP_ALIGN_BYTES
> > +					- alignment_offset;
> > +				src += segment_size;
> > +				dst += segment_size;
> > +				count -= segment_size;
> > +				desc->control.transfer_irq_en = 0;
> > +			}
> > +			/* post short transfer to align to a 4CL (256 byte) */
> > +			desc->len = segment_size;
> > +			send_descriptor(ctx, desc);
> > +		}
> > +		/* at this point we are 4CL (256 byte) aligned */
> > +		if (count >= CCIP_ALIGN_BYTES) {
> > +			desc->rd_address = src & DMA_MASK_32_BIT;
> > +			desc->wr_address = dst & DMA_MASK_32_BIT;
> > +			desc->wr_burst_count = 4;
> > +			desc->rd_burst_count = 4;
> > +			desc->rd_address_ext = (src >> 32) & DMA_MASK_32_BIT;
> > +			desc->wr_address_ext = (dst >> 32) & DMA_MASK_32_BIT;
> > +			/* buffer ends on 4CL boundary */
> > +			if ((count % CCIP_ALIGN_BYTES) == 0) {
> > +				segment_size = count;
> > +				count = 0;
> > +			} else {
> > +				segment_size = count
> > +					- (count % CCIP_ALIGN_BYTES);
> > +				src += segment_size;
> > +				dst += segment_size;
> > +				count -= segment_size;
> > +				desc->control.transfer_irq_en = 0;
> > +			}
> > +			desc->len = segment_size;
> > +			send_descriptor(ctx, desc);
> > +		}
> > +		/* post short transfer to handle the remainder */
> > +		if (count > 0) {
> > +			desc->rd_address = src & DMA_MASK_32_BIT;
> > +			desc->wr_address = dst & DMA_MASK_32_BIT;
> > +			desc->len = count;
> > +			desc->wr_burst_count = 1;
> > +			desc->rd_burst_count = 1;
> > +			desc->rd_address_ext = (src >> 32) & DMA_MASK_32_BIT;
> > +			desc->wr_address_ext = (dst >> 32) & DMA_MASK_32_BIT;
> > +			if (intr_en)
> > +				desc->control.transfer_irq_en = 1;
> > +			send_descriptor(ctx, desc);
> > +		}
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static int issue_magic(struct dma_afu_ctx *ctx)
> > +{
> > +	*(ctx->magic_buf) = 0ULL;
> > +	return do_dma(ctx, DMA_WF_HOST_ADDR(ctx->magic_iova),
> > +		DMA_WF_MAGIC_ROM, 64, 1, FPGA_TO_HOST, 1);
> > +}
> > +
> > +static void wait_magic(struct dma_afu_ctx *ctx)
> > +{
> > +	int magic_timeout = 0;
> > +
> > +	if (!ctx)
> > +		return;
> > +
> > +	poll_interrupt(ctx);
> > +	while (*(ctx->magic_buf) != DMA_WF_MAGIC) {
> > +		if (magic_timeout++ > 1000) {
> > +			AFU_MF_PMD_ERR("DMA magic operation timeout");
> > +			magic_timeout = 0;
> > +			break;
> > +		}
> > +	}
> > +	*(ctx->magic_buf) = 0ULL;
> > +}
> > +
> > +static int dma_tx_buf(struct dma_afu_ctx *ctx, uint64_t dst, uint64_t src,
> > +	uint64_t chunk, int is_last_chunk, int *intr_issued)
> > +{
> > +	int intr_en = 0;
> > +	int ret = 0;
> > +
> > +	if (!ctx || !intr_issued)
> > +		return -EINVAL;
> > +
> > +	src += chunk * ctx->dma_buf_size;
> > +	dst += chunk * ctx->dma_buf_size;
> > +
> > +	if (((chunk % HALF_DMA_BUF) == (HALF_DMA_BUF - 1)) || is_last_chunk) {
> > +		if (*intr_issued) {
> > +			ret = poll_interrupt(ctx);
> > +			if (ret)
> > +				return ret;
> > +		}
> > +		intr_en = 1;
> > +	}
> > +
> > +	chunk %= NUM_DMA_BUF;
> > +	rte_memcpy(ctx->dma_buf[chunk], (void *)(uintptr_t)src,
> > +		ctx->dma_buf_size);
> > +	ret = do_dma(ctx, dst, DMA_HOST_ADDR(ctx->dma_iova[chunk]),
> > +			ctx->dma_buf_size, 0, HOST_TO_FPGA, intr_en);
> > +	if (intr_en)
> > +		*intr_issued = 1;
> > +
> > +	return ret;
> > +}
> > +
> > +static int dma_host_to_fpga(struct dma_afu_ctx *ctx, uint64_t dst, uint64_t src,
> > +	size_t count)
> > +{
> > +	uint64_t i = 0;
> > +	uint64_t count_left = count;
> > +	uint64_t aligned_addr = 0;
> > +	uint64_t align_bytes = 0;
> > +	uint64_t dma_chunks = 0;
> > +	uint64_t dma_tx_bytes = 0;
> > +	uint64_t offset = 0;
> > +	int issued_intr = 0;
> > +	int ret = 0;
> > +
> > +	AFU_MF_PMD_DEBUG("0x%"PRIx64" ---> 0x%"PRIx64" (%zu)", src, dst,
> > +		count);
> > +
> > +	if (!ctx)
> > +		return -EINVAL;
> > +
> > +	if (!IS_DMA_ALIGNED(dst)) {
> > +		if (count_left < DMA_ALIGN_BYTES)
> > +			return ase_host_to_fpga(ctx, &dst, &src, count_left);
> > +
> > +		aligned_addr = ((dst / DMA_ALIGN_BYTES) + 1)
> > +			* DMA_ALIGN_BYTES;
> > +		align_bytes = aligned_addr - dst;
> > +		ret = ase_host_to_fpga(ctx, &dst, &src, align_bytes);
> > +		if (ret)
> > +			return ret;
> > +		count_left = count_left - align_bytes;
> > +	}
> > +
> > +	if (count_left) {
> > +		dma_chunks = count_left / ctx->dma_buf_size;
> > +		offset = dma_chunks * ctx->dma_buf_size;
> > +		count_left -= offset;
> > +		AFU_MF_PMD_DEBUG("0x%"PRIx64" ---> 0x%"PRIx64
> > +			" (%"PRIu64"...0x%"PRIx64")",
> > +			src, dst, dma_chunks, count_left);
> > +		for (i = 0; i < dma_chunks; i++) {
> > +			ret = dma_tx_buf(ctx, dst, src, i,
> > +				i == (dma_chunks - 1), &issued_intr);
> > +			if (ret)
> > +				return ret;
> > +		}
> > +
> > +		if (issued_intr) {
> > +			ret = poll_interrupt(ctx);
> > +			if (ret)
> > +				return ret;
> > +		}
> > +
> > +		if (count_left) {
> > +			i = count_left / DMA_ALIGN_BYTES;
> > +			if (i > 0) {
> > +				dma_tx_bytes = i * DMA_ALIGN_BYTES;
> > +				AFU_MF_PMD_DEBUG("left over 0x%"PRIx64" to DMA",
> > +					dma_tx_bytes);
> > +				rte_memcpy(ctx->dma_buf[0],
> > +					(void *)(uintptr_t)(src + offset),
> > +					dma_tx_bytes);
> > +				ret = do_dma(ctx, dst + offset,
> > +					DMA_HOST_ADDR(ctx->dma_iova[0]),
> > +					dma_tx_bytes, 1, HOST_TO_FPGA, 1);
> > +				if (ret)
> > +					return ret;
> > +				ret = poll_interrupt(ctx);
> > +				if (ret)
> > +					return ret;
> > +			}
> > +
> > +			count_left -= dma_tx_bytes;
> > +			if (count_left) {
> > +				AFU_MF_PMD_DEBUG("left over 0x%"PRIx64" to ASE",
> > +					count_left);
> > +				dst += offset + dma_tx_bytes;
> > +				src += offset + dma_tx_bytes;
> > +				ret = ase_host_to_fpga(ctx, &dst, &src,
> > +					count_left);
> > +			}
> > +		}
> > +	}
> > +
> > +	return ret;
> > +}
> > +
> > +static int dma_rx_buf(struct dma_afu_ctx *ctx, uint64_t dst, uint64_t src,
> > +	uint64_t chunk, int is_last_chunk, uint64_t *rx_count, int *wf_issued)
> > +{
> > +	uint64_t i = chunk % NUM_DMA_BUF;
> > +	uint64_t n = *rx_count;
> > +	uint64_t num_pending = 0;
> > +	int ret = 0;
> > +
> > +	if (!ctx || !wf_issued)
> > +		return -EINVAL;
> > +
> > +	ret = do_dma(ctx, DMA_HOST_ADDR(ctx->dma_iova[i]),
> > +		src + chunk * ctx->dma_buf_size,
> > +		ctx->dma_buf_size, 1, FPGA_TO_HOST, 0);
> > +	if (ret)
> > +		return ret;
> > +
> > +	num_pending = chunk - n + 1;
> > +	if (num_pending == HALF_DMA_BUF) {
> > +		ret = issue_magic(ctx);
> > +		if (ret) {
> > +			AFU_MF_PMD_DEBUG("Magic issue failed");
> > +			return ret;
> > +		}
> > +		*wf_issued = 1;
> > +	}
> > +
> > +	if ((num_pending > (NUM_DMA_BUF - 1)) || is_last_chunk) {
> > +		if (*wf_issued) {
> > +			wait_magic(ctx);
> > +			for (i = 0; i < HALF_DMA_BUF; i++) {
> > +				rte_memcpy((void *)(uintptr_t)(dst +
> > +						n * ctx->dma_buf_size),
> > +					ctx->dma_buf[n % NUM_DMA_BUF],
> > +					ctx->dma_buf_size);
> > +				n++;
> > +			}
> > +			*wf_issued = 0;
> > +			*rx_count = n;
> > +		}
> > +		ret = issue_magic(ctx);
> > +		if (ret) {
> > +			AFU_MF_PMD_DEBUG("Magic issue failed");
> > +			return ret;
> > +		}
> > +		*wf_issued = 1;
> > +	}
> > +
> > +	return ret;
> > +}
> > +
> > +static int dma_fpga_to_host(struct dma_afu_ctx *ctx, uint64_t dst, uint64_t src,
> > +	size_t count)
> > +{
> > +	uint64_t i = 0;
> > +	uint64_t count_left = count;
> > +	uint64_t aligned_addr = 0;
> > +	uint64_t align_bytes = 0;
> > +	uint64_t dma_chunks = 0;
> > +	uint64_t pending_buf = 0;
> > +	uint64_t dma_rx_bytes = 0;
> > +	uint64_t offset = 0;
> > +	int wf_issued = 0;
> > +	int ret = 0;
> > +
> > +	AFU_MF_PMD_DEBUG("0x%"PRIx64" ---> 0x%"PRIx64" (%zu)", src, dst,
> > +		count);
> > +
> > +	if (!ctx)
> > +		return -EINVAL;
> > +
> > +	if (!IS_DMA_ALIGNED(src)) {
> > +		if (count_left < DMA_ALIGN_BYTES)
> > +			return ase_fpga_to_host(ctx, &src, &dst, count_left);
> > +
> > +		aligned_addr = ((src / DMA_ALIGN_BYTES) + 1)
> > +			 * DMA_ALIGN_BYTES;
> > +		align_bytes = aligned_addr - src;
> > +		ret = ase_fpga_to_host(ctx, &src, &dst, align_bytes);
> > +		if (ret)
> > +			return ret;
> > +		count_left = count_left - align_bytes;
> > +	}
> > +
> > +	if (count_left) {
> > +		dma_chunks = count_left / ctx->dma_buf_size;
> > +		offset = dma_chunks * ctx->dma_buf_size;
> > +		count_left -= offset;
> > +		AFU_MF_PMD_DEBUG("0x%"PRIx64" ---> 0x%"PRIx64
> > +			" (%"PRIu64"...0x%"PRIx64")",
> > +			src, dst, dma_chunks, count_left);
> > +		for (i = 0; i < dma_chunks; i++) {
> > +			ret = dma_rx_buf(ctx, dst, src, i,
> > +				i == (dma_chunks - 1),
> > +				&pending_buf, &wf_issued);
> > +			if (ret)
> > +				return ret;
> > +		}
> > +
> > +		if (wf_issued)
> > +			wait_magic(ctx);
> > +
> > +		/* clear out final dma memcpy operations */
> > +		while (pending_buf < dma_chunks) {
> > +			/* constant size transfer; no length check required */
> > +			rte_memcpy((void *)(uintptr_t)(dst +
> > +					pending_buf * ctx->dma_buf_size),
> > +				ctx->dma_buf[pending_buf % NUM_DMA_BUF],
> > +				ctx->dma_buf_size);
> > +			pending_buf++;
> > +		}
> > +
> > +		if (count_left > 0) {
> > +			i = count_left / DMA_ALIGN_BYTES;
> > +			if (i > 0) {
> > +				dma_rx_bytes = i * DMA_ALIGN_BYTES;
> > +				AFU_MF_PMD_DEBUG("left over 0x%"PRIx64" to DMA",
> > +					dma_rx_bytes);
> > +				ret = do_dma(ctx,
> > +					DMA_HOST_ADDR(ctx->dma_iova[0]),
> > +					src + offset,
> > +					dma_rx_bytes, 1, FPGA_TO_HOST, 0);
> > +				if (ret)
> > +					return ret;
> > +				ret = issue_magic(ctx);
> > +				if (ret)
> > +					return ret;
> > +				wait_magic(ctx);
> > +				rte_memcpy((void *)(uintptr_t)(dst + offset),
> > +					ctx->dma_buf[0], dma_rx_bytes);
> > +			}
> > +
> > +			count_left -= dma_rx_bytes;
> > +			if (count_left) {
> > +				AFU_MF_PMD_DEBUG("left over 0x%"PRIx64" to ASE",
> > +					count_left);
> > +				dst += offset + dma_rx_bytes;
> > +				src += offset + dma_rx_bytes;
> > +				ret = ase_fpga_to_host(ctx, &src, &dst,
> > +							count_left);
> > +			}
> > +		}
> > +	}
> > +
> > +	return ret;
> > +}
> > +
> > +static int dma_fpga_to_fpga(struct dma_afu_ctx *ctx, uint64_t dst, uint64_t src,
> > +	size_t count)
> > +{
> > +	uint64_t i = 0;
> > +	uint64_t count_left = count;
> > +	uint64_t dma_chunks = 0;
> > +	uint64_t offset = 0;
> > +	uint32_t tx_chunks = 0;
> > +	uint64_t *tmp_buf = NULL;
> > +	int ret = 0;
> > +
> > +	AFU_MF_PMD_DEBUG("0x%"PRIx64" ---> 0x%"PRIx64" (%zu)", src, dst,
> > +		count);
> > +
> > +	if (!ctx)
> > +		return -EINVAL;
> > +
> > +	if (IS_DMA_ALIGNED(dst) && IS_DMA_ALIGNED(src)
> > +	    && IS_DMA_ALIGNED(count_left)) {
> > +		dma_chunks = count_left / ctx->dma_buf_size;
> > +		offset = dma_chunks * ctx->dma_buf_size;
> > +		count_left -= offset;
> > +		AFU_MF_PMD_DEBUG("0x%"PRIx64" ---> 0x%"PRIx64
> > +			" (%"PRIu64"...0x%"PRIx64")",
> > +			src, dst, dma_chunks, count_left);
> > +		for (i = 0; i < dma_chunks; i++) {
> > +			ret = do_dma(ctx, dst + i * ctx->dma_buf_size,
> > +				src + i * ctx->dma_buf_size,
> > +				ctx->dma_buf_size, 0, FPGA_TO_FPGA, 0);
> > +			if (ret)
> > +				return ret;
> > +			if ((((i + 1) % NUM_DMA_BUF) == 0) ||
> > +				(i == (dma_chunks - 1))) {
> > +				ret = issue_magic(ctx);
> > +				if (ret)
> > +					return ret;
> > +				wait_magic(ctx);
> > +			}
> > +		}
> > +
> > +		if (count_left > 0) {
> > +			AFU_MF_PMD_DEBUG("left over 0x%"PRIx64" to DMA", count_left);
> > +			ret = do_dma(ctx, dst + offset, src + offset,
> > +				count_left, 1, FPGA_TO_FPGA, 0);
> > +			if (ret)
> > +				return ret;
> > +			ret = issue_magic(ctx);
> > +			if (ret)
> > +				return ret;
> > +			wait_magic(ctx);
> > +		}
> > +	} else {
> > +		if ((src < dst) && (src + count_left > dst)) {
> > +			AFU_MF_PMD_ERR("Overlapping: 0x%"PRIx64
> > +				" -> 0x%"PRIx64" (0x%"PRIx64")",
> > +				src, dst, count_left);
> > +			return -EINVAL;
> > +		}
> > +		tx_chunks = count_left / ctx->dma_buf_size;
> > +		offset = tx_chunks * ctx->dma_buf_size;
> > +		count_left -= offset;
> > +		AFU_MF_PMD_DEBUG("0x%"PRIx64" --> 0x%"PRIx64
> > +			" (%u...0x%"PRIx64")",
> > +			src, dst, tx_chunks, count_left);
> > +		tmp_buf = (uint64_t *)rte_malloc(NULL, ctx->dma_buf_size,
> > +			DMA_ALIGN_BYTES);
> > +		for (i = 0; i < tx_chunks; i++) {
> > +			ret = dma_fpga_to_host(ctx, (uint64_t)tmp_buf,
> > +				src + i * ctx->dma_buf_size,
> > +				ctx->dma_buf_size);
> > +			if (ret)
> > +				goto free_buf;
> > +			ret = dma_host_to_fpga(ctx,
> > +				dst + i * ctx->dma_buf_size,
> > +				(uint64_t)tmp_buf, ctx->dma_buf_size);
> > +			if (ret)
> > +				goto free_buf;
> > +		}
> > +
> > +		if (count_left > 0) {
> > +			ret = dma_fpga_to_host(ctx, (uint64_t)tmp_buf,
> > +				src + offset, count_left);
> > +			if (ret)
> > +				goto free_buf;
> > +			ret = dma_host_to_fpga(ctx, dst + offset,
> > +				(uint64_t)tmp_buf, count_left);
> > +			if (ret)
> > +				goto free_buf;
> > +		}
> > +free_buf:
> > +		rte_free(tmp_buf);
> > +	}
> > +
> > +	return ret;
> > +}
> > +
> > +static int dma_transfer_sync(struct dma_afu_ctx *ctx, uint64_t dst,
> > +	uint64_t src, size_t count, fpga_dma_type type)
> > +{
> > +	int ret = 0;
> > +
> > +	if (!ctx)
> > +		return -EINVAL;
> > +
> > +	if (type == HOST_TO_FPGA)
> > +		ret = dma_host_to_fpga(ctx, dst, src, count);
> > +	else if (type == FPGA_TO_HOST)
> > +		ret = dma_fpga_to_host(ctx, dst, src, count);
> > +	else if (type == FPGA_TO_FPGA)
> > +		ret = dma_fpga_to_fpga(ctx, dst, src, count);
> > +	else
> > +		return -EINVAL;
> > +
> > +	return ret;
> > +}
> > +
> > +static double getTime(struct timespec start, struct timespec end)
> > +{
> > +	uint64_t diff = 1000000000L * (end.tv_sec - start.tv_sec)
> > +		+ end.tv_nsec - start.tv_nsec;
> > +	return (double)diff / (double)1000000000L;
> > +}
> > +
> > +#define SWEEP_ITERS 1
> > +static int sweep_test(struct dma_afu_ctx *ctx, uint32_t length,
> > +	uint64_t ddr_offset, uint64_t buf_offset, uint64_t size_decrement)
> > +{
> > +	struct timespec start, end;
> > +	uint64_t test_size = 0;
> > +	uint64_t *dma_buf_ptr = NULL;
> > +	double throughput, total_time = 0.0;
> > +	int i = 0;
> > +	int ret = 0;
> > +
> > +	if (!ctx || !ctx->data_buf || !ctx->ref_buf) {
> > +		AFU_MF_PMD_ERR("Buffer for DMA test is not allocated");
> > +		return -EINVAL;
> > +	}
> > +
> > +	if (length < (buf_offset + size_decrement)) {
> > +		AFU_MF_PMD_ERR("Test length does not match unaligned parameter");
> > +		return -EINVAL;
> > +	}
> > +	test_size = length - (buf_offset + size_decrement);
> > +	if ((ddr_offset + test_size) > ctx->mem_size) {
> > +		AFU_MF_PMD_ERR("Test is out of DDR memory space");
> > +		return -EINVAL;
> > +	}
> > +
> > +	dma_buf_ptr = (uint64_t *)((uint8_t *)ctx->data_buf + buf_offset);
> > +	printf("Sweep Host %p to FPGA 0x%"PRIx64
> > +		" with 0x%"PRIx64" bytes ...\n",
> > +		(void *)dma_buf_ptr, ddr_offset, test_size);
> > +
> > +	for (i = 0; i < SWEEP_ITERS; i++) {
> > +		clock_gettime(CLOCK_MONOTONIC, &start);
> > +		ret = dma_transfer_sync(ctx, ddr_offset, (uint64_t)dma_buf_ptr,
> > +			test_size, HOST_TO_FPGA);
> > +		clock_gettime(CLOCK_MONOTONIC, &end);
> > +		if (ret) {
> > +			AFU_MF_PMD_ERR("Failed");
> > +			return ret;
> > +		}
> > +		total_time += getTime(start, end);
> > +	}
> > +	throughput = (test_size * SWEEP_ITERS) / (total_time * 1000000);
> > +	printf("Measured bandwidth = %lf MB/s\n", throughput);
> > +
> > +	printf("Sweep FPGA 0x%"PRIx64" to Host %p with 0x%"PRIx64" bytes ...\n",
> > +		ddr_offset, (void *)dma_buf_ptr, test_size);
> > +
> > +	total_time = 0.0;
> > +	memset((char *)dma_buf_ptr, 0, test_size);
> > +	for (i = 0; i < SWEEP_ITERS; i++) {
> > +		clock_gettime(CLOCK_MONOTONIC, &start);
> > +		ret = dma_transfer_sync(ctx, (uint64_t)dma_buf_ptr, ddr_offset,
> > +			test_size, FPGA_TO_HOST);
> > +		clock_gettime(CLOCK_MONOTONIC, &end);
> > +		if (ret) {
> > +			AFU_MF_PMD_ERR("Failed");
> > +			return ret;
> > +		}
> > +		total_time += getTime(start, end);
> > +	}
> > +	throughput = (test_size * SWEEP_ITERS) / (total_time * 1000000);
> > +	printf("Measured bandwidth = %lf MB/s\n", throughput);
> > +
> > +	printf("Verifying buffer ...\n");
> > +	return dma_afu_buf_verify(ctx, test_size);
> > +}
> > +
> > +static int dma_afu_test(struct afu_mf_rawdev *dev)
> > +{
> > +	struct n3000_afu_priv *priv = NULL;
> > +	struct dma_afu_ctx *ctx = NULL;
> > +	struct rte_pmd_afu_dma_cfg *cfg = NULL;
> > +	msgdma_ctrl ctrl;
> > +	uint64_t offset = 0;
> > +	uint32_t i = 0;
> > +	int ret = 0;
> > +
> > +	if (!dev)
> > +		return -EINVAL;
> > +
> > +	if (!dev->priv)
> > +		return -ENOENT;
> > +
> > +	priv = (struct n3000_afu_priv *)dev->priv;
> > +	cfg = &priv->dma_cfg;
> > +	if (cfg->index >= NUM_N3000_DMA)
> > +		return -EINVAL;
> > +	ctx = &priv->dma_ctx[cfg->index];
> > +
> > +	ctx->pattern = (int)cfg->pattern;
> > +	ctx->verbose = (int)cfg->verbose;
> > +	ctx->dma_buf_size = cfg->size;
> > +
> > +	ret = dma_afu_buf_alloc(ctx, cfg);
> > +	if (ret)
> > +		goto free;
> > +
> > +	printf("Initialize test buffer\n");
> > +	dma_afu_buf_init(ctx, cfg->length);
> > +
> > +	/* enable interrupt */
> > +	ctrl.csr = 0;
> > +	ctrl.global_intr_en_mask = 1;
> > +	rte_write32(ctrl.csr, CSR_CONTROL(ctx->csr_addr));
> > +
> > +	printf("Host %p to FPGA 0x%x with 0x%x bytes\n", ctx->data_buf,
> > +		cfg->offset, cfg->length);
> > +	ret = dma_transfer_sync(ctx, cfg->offset, (uint64_t)ctx->data_buf,
> > +		cfg->length, HOST_TO_FPGA);
> > +	if (ret) {
> > +		AFU_MF_PMD_ERR("Failed to transfer data from host to FPGA");
> > +		goto end;
> > +	}
> > +	memset(ctx->data_buf, 0, cfg->length);
> > +
> > +	printf("FPGA 0x%x to Host %p with 0x%x bytes\n", cfg->offset,
> > +		ctx->data_buf, cfg->length);
> > +	ret = dma_transfer_sync(ctx, (uint64_t)ctx->data_buf, cfg->offset,
> > +		cfg->length, FPGA_TO_HOST);
> > +	if (ret) {
> > +		AFU_MF_PMD_ERR("Failed to transfer data from FPGA to host");
> > +		goto end;
> > +	}
> > +	ret = dma_afu_buf_verify(ctx, cfg->length);
> > +	if (ret)
> > +		goto end;
> > +
> > +	if ((cfg->offset + cfg->length * 2) <= ctx->mem_size)
> > +		offset = cfg->offset + cfg->length;
> > +	else if (cfg->offset > cfg->length)
> > +		offset = 0;
> > +	else
> > +		goto end;
> > +
> > +	printf("FPGA 0x%x to FPGA 0x%"PRIx64" with 0x%x bytes\n",
> > +		cfg->offset, offset, cfg->length);
> > +	ret = dma_transfer_sync(ctx, offset, cfg->offset, cfg->length,
> > +		FPGA_TO_FPGA);
> > +	if (ret) {
> > +		AFU_MF_PMD_ERR("Failed to transfer data from FPGA to FPGA");
> > +		goto end;
> > +	}
> > +
> > +	printf("FPGA 0x%"PRIx64" to Host %p with 0x%x bytes\n", offset,
> > +		ctx->data_buf, cfg->length);
> > +	ret = dma_transfer_sync(ctx, (uint64_t)ctx->data_buf, offset,
> > +		cfg->length, FPGA_TO_HOST);
> > +	if (ret) {
> > +		AFU_MF_PMD_ERR("Failed to transfer data from FPGA to host");
> > +		goto end;
> > +	}
> > +	ret = dma_afu_buf_verify(ctx, cfg->length);
> > +	if (ret)
> > +		goto end;
> > +
> > +	printf("Sweep with aligned address and size\n");
> > +	ret = sweep_test(ctx, cfg->length, cfg->offset, 0, 0);
> > +	if (ret)
> > +		goto end;
> > +
> > +	if (cfg->unaligned) {
> > +		printf("Sweep with unaligned address and size\n");
> > +		struct unaligned_set {
> > +			uint64_t addr_offset;
> > +			uint64_t size_dec;
> > +		} param[] = {{61, 5}, {3, 0}, {7, 3}, {0, 3}, {0, 61}, {0, 7}};
> > +		for (i = 0; i < ARRAY_SIZE(param); i++) {
> > +			ret = sweep_test(ctx, cfg->length, cfg->offset,
> > +				param[i].addr_offset, param[i].size_dec);
> > +			if (ret)
> > +				break;
> > +		}
> > +	}
> > +
> > +end:
> > +	/* disable interrupt */
> > +	ctrl.global_intr_en_mask = 0;
> > +	rte_write32(ctrl.csr, CSR_CONTROL(ctx->csr_addr));
> > +
> > +free:
> > +	dma_afu_buf_free(ctx);
> > +	return ret;
> > +}
> > +
> > +static struct rte_pci_device *n3000_afu_get_pci_dev(struct afu_mf_rawdev *dev)
> > +{
> > +	struct rte_afu_device *afudev = NULL;
> > +
> > +	if (!dev || !dev->rawdev || !dev->rawdev->device)
> > +		return NULL;
> > +
> > +	afudev = RTE_DEV_TO_AFU(dev->rawdev->device);
> > +	if (!afudev->rawdev || !afudev->rawdev->device)
> > +		return NULL;
> > +
> > +	return RTE_DEV_TO_PCI(afudev->rawdev->device);
> > +}
> > +
> > +#ifdef VFIO_PRESENT
> > +static int dma_afu_set_irqs(struct afu_mf_rawdev *dev, uint32_t vec_start,
> > +	uint32_t count, int *efds)
> > +{
> > +	struct rte_pci_device *pci_dev = NULL;
> > +	struct vfio_irq_set *irq_set = NULL;
> > +	int vfio_dev_fd = 0;
> > +	size_t sz = 0;
> > +	int ret = 0;
> > +
> > +	if (!dev || !efds || (count == 0) || (count > MAX_MSIX_VEC))
> > +		return -EINVAL;
> > +
> > +	pci_dev = n3000_afu_get_pci_dev(dev);
> > +	if (!pci_dev)
> > +		return -ENODEV;
> > +	vfio_dev_fd = rte_intr_dev_fd_get(pci_dev->intr_handle);
> > +
> > +	sz = sizeof(*irq_set) + sizeof(*efds) * count;
> > +	irq_set = rte_zmalloc(NULL, sz, 0);
> > +	if (!irq_set)
> > +		return -ENOMEM;
> > +
> > +	irq_set->argsz = (uint32_t)sz;
> > +	irq_set->count = count;
> > +	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
> > +		VFIO_IRQ_SET_ACTION_TRIGGER;
> > +	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
> > +	irq_set->start = vec_start;
> > +
> > +	rte_memcpy(&irq_set->data, efds, sizeof(*efds) * count);
> > +	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
> > +	if (ret)
> > +		AFU_MF_PMD_ERR("Error enabling MSI-X interrupts\n");
> > +
> > +	rte_free(irq_set);
> > +	return ret;
> > +}
> > +#endif
> > +
> > +static void *n3000_afu_get_port_addr(struct afu_mf_rawdev *dev)
> > +{
> > +	struct rte_pci_device *pci_dev = NULL;
> > +	uint8_t *addr = NULL;
> > +	uint64_t val = 0;
> > +	uint32_t bar = 0;
> > +
> > +	pci_dev = n3000_afu_get_pci_dev(dev);
> > +	if (!pci_dev)
> > +		return NULL;
> > +
> > +	addr = (uint8_t *)pci_dev->mem_resource[0].addr;
> > +	val = rte_read64(addr + PORT_ATTR_REG(dev->port));
> > +	if (!PORT_IMPLEMENTED(val)) {
> > +		AFU_MF_PMD_INFO("FIU port %d is not implemented", dev->port);
> > +		return NULL;
> > +	}
> > +
> > +	bar = PORT_BAR(val);
> > +	if (bar >= PCI_MAX_RESOURCE) {
> > +		AFU_MF_PMD_ERR("BAR index %u is out of limit", bar);
> > +		return NULL;
> > +	}
> > +
> > +	addr = (uint8_t *)pci_dev->mem_resource[bar].addr + PORT_OFFSET(val);
> > +	return addr;
> > +}
> > +
> > +static int n3000_afu_get_irq_capability(struct afu_mf_rawdev *dev,
> > +	uint32_t *vec_start, uint32_t *vec_count)
> > +{
> > +	uint8_t *addr = NULL;
> > +	uint64_t val = 0;
> > +	uint64_t header = 0;
> > +	uint64_t next_offset = 0;
> > +
> > +	addr = (uint8_t *)n3000_afu_get_port_addr(dev);
> > +	if (!addr)
> > +		return -ENOENT;
> > +
> > +	do {
> > +		addr += next_offset;
> > +		header = rte_read64(addr);
> > +		if ((DFH_TYPE(header) == DFH_TYPE_PRIVATE) &&
> > +			(DFH_FEATURE_ID(header) == PORT_FEATURE_UINT_ID)) {
> > +			val = rte_read64(addr + PORT_UINT_CAP_REG);
> > +			if (vec_start)
> > +				*vec_start = PORT_VEC_START(val);
> > +			if (vec_count)
> > +				*vec_count = PORT_VEC_COUNT(val);
> > +			return 0;
> > +		}
> > +		next_offset = DFH_NEXT_OFFSET(header);
> > +		if (((next_offset & 0xffff) == 0xffff) || (next_offset == 0))
> > +			break;
> > +	} while (!DFH_EOL(header));
> > +
> > +	return -ENOENT;
> > +}
> > +
> > +static int nlb_afu_ctx_release(struct afu_mf_rawdev *dev)
> > +{
> > +	struct n3000_afu_priv *priv = NULL;
> > +	struct nlb_afu_ctx *ctx = NULL;
> > +
> > +	if (!dev)
> > +		return -EINVAL;
> > +
> > +	priv = (struct n3000_afu_priv *)dev->priv;
> > +	if (!priv)
> > +		return -ENOENT;
> > +
> > +	ctx = &priv->nlb_ctx;
> > +
> > +	rte_free(ctx->dsm_ptr);
> > +	ctx->dsm_ptr = NULL;
> > +	ctx->status_ptr = NULL;
> > +
> > +	rte_free(ctx->src_ptr);
> > +	ctx->src_ptr = NULL;
> > +
> > +	rte_free(ctx->dest_ptr);
> > +	ctx->dest_ptr = NULL;
> > +
> > +	return 0;
> > +}
> > +
> > +static int nlb_afu_ctx_init(struct afu_mf_rawdev *dev, uint8_t *addr)
> > +{
> > +	struct n3000_afu_priv *priv = NULL;
> > +	struct nlb_afu_ctx *ctx = NULL;
> > +	int ret = 0;
> > +
> > +	if (!dev || !addr)
> > +		return -EINVAL;
> > +
> > +	priv = (struct n3000_afu_priv *)dev->priv;
> > +	if (!priv)
> > +		return -ENOENT;
> > +
> > +	ctx = &priv->nlb_ctx;
> > +	ctx->addr = addr;
> > +
> > +	ctx->dsm_ptr = (uint8_t *)rte_zmalloc(NULL, DSM_SIZE, TEST_MEM_ALIGN);
> > +	if (!ctx->dsm_ptr) {
> > +		ret = -ENOMEM;
> > +		goto release;
> > +	}
> > +	ctx->dsm_iova = rte_malloc_virt2iova(ctx->dsm_ptr);
> > +	if (ctx->dsm_iova == RTE_BAD_IOVA) {
> > +		ret = -ENOMEM;
> > +		goto release;
> > +	}
> > +
> > +	ctx->src_ptr = (uint8_t *)rte_zmalloc(NULL, NLB_BUF_SIZE,
> > +		TEST_MEM_ALIGN);
> > +	if (!ctx->src_ptr) {
> > +		ret = -ENOMEM;
> > +		goto release;
> > +	}
> > +	ctx->src_iova = rte_malloc_virt2iova(ctx->src_ptr);
> > +	if (ctx->src_iova == RTE_BAD_IOVA) {
> > +		ret = -ENOMEM;
> > +		goto release;
> > +	}
> > +
> > +	ctx->dest_ptr = (uint8_t *)rte_zmalloc(NULL, NLB_BUF_SIZE,
> > +		TEST_MEM_ALIGN);
> > +	if (!ctx->dest_ptr) {
> > +		ret = -ENOMEM;
> > +		goto release;
> > +	}
> 
> If the ctx->dest_ptr allocation fails, will nlb_afu_ctx_release() still work
> correctly?
> 
> > +	ctx->dest_iova = rte_malloc_virt2iova(ctx->dest_ptr);
> > +	if (ctx->dest_iova == RTE_BAD_IOVA) {
> > +		ret = -ENOMEM;
> > +		goto release;
> > +	}
> > +
> > +	ctx->status_ptr = (struct nlb_dsm_status *)(ctx->dsm_ptr + DSM_STATUS);
> > +	return 0;
> > +
> > +release:
> > +	nlb_afu_ctx_release(dev);
> > +	return ret;
> > +}
> > +
> > +static int dma_afu_ctx_release(struct afu_mf_rawdev *dev)
> > +{
> > +	struct n3000_afu_priv *priv = NULL;
> > +	struct dma_afu_ctx *ctx = NULL;
> > +
> > +	if (!dev)
> > +		return -EINVAL;
> > +
> > +	priv = (struct n3000_afu_priv *)dev->priv;
> > +	if (!priv)
> > +		return -ENOENT;
> > +
> > +	ctx = &priv->dma_ctx[0];
> > +
> > +	rte_free(ctx->desc_buf);
> > +	ctx->desc_buf = NULL;
> > +
> > +	rte_free(ctx->magic_buf);
> > +	ctx->magic_buf = NULL;
> > +
> > +	close(ctx->event_fd);
> > +	return 0;
> > +}
> > +
> > +static int dma_afu_ctx_init(struct afu_mf_rawdev *dev, int index, uint8_t *addr)
> > +{
> > +	struct n3000_afu_priv *priv = NULL;
> > +	struct dma_afu_ctx *ctx = NULL;
> > +	uint64_t mem_sz[] = {0x100000000, 0x100000000, 0x40000000, 0x1000000};
> > +	static int efds[1] = {0};
> > +	uint32_t vec_start = 0;
> > +	int ret = 0;
> > +
> > +	if (!dev || (index < 0) || (index >= NUM_N3000_DMA) || !addr)
> > +		return -EINVAL;
> > +
> > +	priv = (struct n3000_afu_priv *)dev->priv;
> > +	if (!priv)
> > +		return -ENOENT;
> > +
> > +	ctx = &priv->dma_ctx[index];
> > +	ctx->index = index;
> > +	ctx->addr = addr;
> > +	ctx->csr_addr = addr + DMA_CSR;
> > +	ctx->desc_addr = addr + DMA_DESC;
> > +	ctx->ase_ctrl_addr = addr + DMA_ASE_CTRL;
> > +	ctx->ase_data_addr = addr + DMA_ASE_DATA;
> > +	ctx->mem_size = mem_sz[ctx->index];
> > +	ctx->cur_ase_page = INVALID_ASE_PAGE;
> > +	if (ctx->index == 0) {
> > +		ret = n3000_afu_get_irq_capability(dev, &vec_start, NULL);
> > +		if (ret)
> > +			return ret;
> > +
> > +		efds[0] = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> > +		if (efds[0] < 0) {
> > +			AFU_MF_PMD_ERR("eventfd create failed");
> > +			return -EBADF;
> > +		}
> > +#ifdef VFIO_PRESENT
> > +		if (dma_afu_set_irqs(dev, vec_start, 1, efds))
> > +			AFU_MF_PMD_ERR("DMA interrupt setup failed");
> > +#endif
> > +	}
> > +	ctx->event_fd = efds[0];
> > +
> > +	ctx->desc_buf = (msgdma_ext_desc *)rte_zmalloc(NULL,
> > +		sizeof(msgdma_ext_desc), DMA_ALIGN_BYTES);
> > +	if (!ctx->desc_buf) {
> > +		ret = -ENOMEM;
> > +		goto release;
> > +	}
> > +
> > +	ctx->magic_buf = (uint64_t *)rte_zmalloc(NULL, MAGIC_BUF_SIZE,
> > +		TEST_MEM_ALIGN);
> > +	if (!ctx->magic_buf) {
> > +		ret = -ENOMEM;
> > +		goto release;
> > +	}
> 
> If the ctx->magic_buf allocation fails, will dma_afu_ctx_release() still work
> correctly?
> 
> > +	ctx->magic_iova = rte_malloc_virt2iova(ctx->magic_buf);
> > +	if (ctx->magic_iova == RTE_BAD_IOVA) {
> > +		ret = -ENOMEM;
> > +		goto release;
> > +	}
> > +
> > +	return 0;
> > +
> > +release:
> > +	dma_afu_ctx_release(dev);
> > +	return ret;
> > +}
> > +
> > +static int n3000_afu_ctx_init(struct afu_mf_rawdev *dev)
> > +{
> > +	struct n3000_afu_priv *priv = NULL;
> > +	uint8_t *addr = NULL;
> > +	uint64_t header = 0;
> > +	uint64_t uuid_hi = 0;
> > +	uint64_t uuid_lo = 0;
> > +	uint64_t next_offset = 0;
> > +	int ret = 0;
> > +
> > +	if (!dev)
> > +		return -EINVAL;
> > +
> > +	priv = (struct n3000_afu_priv *)dev->priv;
> > +	if (!priv)
> > +		return -ENOENT;
> > +
> > +	addr = (uint8_t *)dev->addr;
> > +	do {
> > +		addr += next_offset;
> > +		header = rte_read64(addr);
> > +		uuid_lo = rte_read64(addr + DFH_UUID_L_OFFSET);
> > +		uuid_hi = rte_read64(addr + DFH_UUID_H_OFFSET);
> > +
> > +		if ((DFH_TYPE(header) == DFH_TYPE_AFU) &&
> > +			(uuid_lo == N3000_NLB0_UUID_L) &&
> > +			(uuid_hi == N3000_NLB0_UUID_H)) {
> > +			AFU_MF_PMD_INFO("AFU NLB0 found @ %p", (void
> > *)addr);
> > +			ret = nlb_afu_ctx_init(dev, addr);
> > +			if (ret)
> > +				return ret;
> > +		} else if ((DFH_TYPE(header) == DFH_TYPE_BBB) &&
> > +			(uuid_lo == N3000_DMA_UUID_L) &&
> > +			(uuid_hi == N3000_DMA_UUID_H) &&
> > +			(priv->num_dma < NUM_N3000_DMA)) {
> > +			AFU_MF_PMD_INFO("AFU DMA%d found @ %p",
> > +				priv->num_dma, (void *)addr);
> > +			ret = dma_afu_ctx_init(dev, priv->num_dma, addr);
> > +			if (ret)
> > +				return ret;
> > +			priv->num_dma++;
> > +		} else {
> > +			AFU_MF_PMD_DEBUG("DFH: type %"PRIu64
> > +				", uuid %016"PRIx64"%016"PRIx64,
> > +				DFH_TYPE(header), uuid_hi, uuid_lo);
> > +		}
> > +
> > +		next_offset = DFH_NEXT_OFFSET(header);
> > +		if (((next_offset & 0xffff) == 0xffff) || (next_offset == 0))
> > +			break;
> > +	} while (!DFH_EOL(header));
> > +
> > +	return 0;
> > +}
> > +
> > +static int n3000_afu_init(struct afu_mf_rawdev *dev)
> > +{
> > +	if (!dev)
> > +		return -EINVAL;
> > +
> > +	if (!dev->priv) {
> > +		dev->priv = rte_zmalloc(NULL, sizeof(struct n3000_afu_priv),
> 0);
> > +		if (!dev->priv)
> > +			return -ENOMEM;
> > +	}
> > +
> > +	return n3000_afu_ctx_init(dev);
> > +}
> > +
> > +/*
> > + * Validate a user supplied N3000 configuration and store it in the private
> > + * data of the device.
> > + *
> > + * dev:         AFU raw device; its private data must already be allocated.
> > + * config:      pointer to a struct rte_pmd_afu_n3000_cfg.
> > + * config_size: must equal sizeof(struct rte_pmd_afu_n3000_cfg).
> > + *
> > + * Returns 0 on success, -EINVAL on a bad argument or an out-of-range setting,
> > + * -ENOENT when the device has no private data.
> > + */
> > +static int n3000_afu_config(struct afu_mf_rawdev *dev, void *config,
> > +	size_t config_size)
> > +{
> > +	struct n3000_afu_priv *priv = NULL;
> > +	struct rte_pmd_afu_n3000_cfg *cfg = NULL;
> > +	int i = 0;
> > +	uint64_t top = 0;
> > +
> > +	if (!dev || !config || !config_size)
> > +		return -EINVAL;
> > +
> > +	priv = (struct n3000_afu_priv *)dev->priv;
> > +	if (!priv)
> > +		return -ENOENT;
> > +
> > +	if (config_size != sizeof(struct rte_pmd_afu_n3000_cfg))
> > +		return -EINVAL;
> > +
> > +	cfg = (struct rte_pmd_afu_n3000_cfg *)config;
> > +	if (cfg->type == RTE_PMD_AFU_N3000_NLB) {
> > +		if (cfg->nlb_cfg.mode != NLB_MODE_LPBK)
> > +			return -EINVAL;
> > +		if ((cfg->nlb_cfg.read_vc > NLB_VC_RANDOM) ||
> > +			(cfg->nlb_cfg.write_vc > NLB_VC_RANDOM))
> > +			return -EINVAL;
> > +		if (cfg->nlb_cfg.wrfence_vc > NLB_VC_VH1)
> > +			return -EINVAL;
> > +		if (cfg->nlb_cfg.cache_hint > NLB_RDLINE_MIXED)
> > +			return -EINVAL;
> > +		if (cfg->nlb_cfg.cache_policy > NLB_WRPUSH_I)
> > +			return -EINVAL;
> > +		if ((cfg->nlb_cfg.multi_cl != 1) &&
> > +			(cfg->nlb_cfg.multi_cl != 2) &&
> > +			(cfg->nlb_cfg.multi_cl != 4))
> > +			return -EINVAL;
> > +		if ((cfg->nlb_cfg.begin < MIN_CACHE_LINES) ||
> > +			(cfg->nlb_cfg.begin > MAX_CACHE_LINES))
> > +			return -EINVAL;
> > +		if ((cfg->nlb_cfg.end < cfg->nlb_cfg.begin) ||
> > +			(cfg->nlb_cfg.end > MAX_CACHE_LINES))
> > +			return -EINVAL;
> > +		rte_memcpy(&priv->nlb_cfg, &cfg->nlb_cfg,
> > +			sizeof(struct rte_pmd_afu_nlb_cfg));
> > +	} else if (cfg->type == RTE_PMD_AFU_N3000_DMA) {
> > +		if (cfg->dma_cfg.index >= NUM_N3000_DMA)
> > +			return -EINVAL;
> > +		i = cfg->dma_cfg.index;
> > +		if (cfg->dma_cfg.length > priv->dma_ctx[i].mem_size)
> > +			return -EINVAL;
> > +		if (cfg->dma_cfg.offset >= priv->dma_ctx[i].mem_size)
> > +			return -EINVAL;
> > +		/*
> > +		 * length and offset are both 32-bit; widen before adding so
> > +		 * the sum cannot wrap and slip past the range check below.
> > +		 */
> > +		top = (uint64_t)cfg->dma_cfg.length + cfg->dma_cfg.offset;
> > +		if ((top == 0) || (top > priv->dma_ctx[i].mem_size))
> > +			return -EINVAL;
> > +		if (i == 3) {  /* QDR connected to DMA3 */
> > +			/* length is rounded down to a multiple of 64 bytes */
> > +			if (cfg->dma_cfg.length & 0x3f) {
> > +				cfg->dma_cfg.length &= ~0x3f;
> > +				AFU_MF_PMD_INFO("Round size to %x for QDR",
> > +					cfg->dma_cfg.length);
> > +			}
> > +		}
> > +		rte_memcpy(&priv->dma_cfg, &cfg->dma_cfg,
> > +			sizeof(struct rte_pmd_afu_dma_cfg));
> > +	} else {
> > +		AFU_MF_PMD_ERR("Invalid type of N3000 AFU");
> > +		return -EINVAL;
> > +	}
> > +
> > +	/* remember which module the stored configuration applies to */
> > +	priv->cfg_type = cfg->type;
> > +	return 0;
> > +}
> > +
> > +static int n3000_afu_test(struct afu_mf_rawdev *dev)
> > +{
> > +	struct n3000_afu_priv *priv = NULL;
> > +	int ret = 0;
> > +
> > +	if (!dev)
> > +		return -EINVAL;
> > +
> > +	if (!dev->priv)
> > +		return -ENOENT;
> > +
> > +	priv = (struct n3000_afu_priv *)dev->priv;
> > +
> > +	if (priv->cfg_type == RTE_PMD_AFU_N3000_NLB) {
> > +		AFU_MF_PMD_INFO("Test NLB");
> > +		ret = nlb_afu_test(dev);
> > +	} else if (priv->cfg_type == RTE_PMD_AFU_N3000_DMA) {
> > +		AFU_MF_PMD_INFO("Test DMA%u", priv->dma_cfg.index);
> > +		ret = dma_afu_test(dev);
> > +	} else {
> > +		AFU_MF_PMD_ERR("Please configure AFU before test");
> > +		ret = -EINVAL;
> > +	}
> > +
> > +	return ret;
> > +}
> > +
> > +static int n3000_afu_close(struct afu_mf_rawdev *dev)
> > +{
> > +	if (!dev)
> > +		return -EINVAL;
> > +
> > +	nlb_afu_ctx_release(dev);
> > +	dma_afu_ctx_release(dev);
> > +
> > +	rte_free(dev->priv);
> > +	dev->priv = NULL;
> > +
> > +	return 0;
> > +}
> > +
> > +static int n3000_afu_dump(struct afu_mf_rawdev *dev, FILE *f)
> > +{
> > +	struct n3000_afu_priv *priv = NULL;
> > +
> > +	if (!dev)
> > +		return -EINVAL;
> > +
> > +	priv = (struct n3000_afu_priv *)dev->priv;
> > +	if (!priv)
> > +		return -ENOENT;
> > +
> > +	if (!f)
> > +		f = stdout;
> > +
> > +	if (priv->cfg_type == RTE_PMD_AFU_N3000_NLB) {
> > +		struct nlb_afu_ctx *ctx = &priv->nlb_ctx;
> > +		fprintf(f, "addr:\t\t%p\n", (void *)ctx->addr);
> > +		fprintf(f, "dsm_ptr:\t%p\n", (void *)ctx->dsm_ptr);
> > +		fprintf(f, "dsm_iova:\t0x%"PRIx64"\n", ctx->dsm_iova);
> > +		fprintf(f, "src_ptr:\t%p\n", (void *)ctx->src_ptr);
> > +		fprintf(f, "src_iova:\t0x%"PRIx64"\n", ctx->src_iova);
> > +		fprintf(f, "dest_ptr:\t%p\n", (void *)ctx->dest_ptr);
> > +		fprintf(f, "dest_iova:\t0x%"PRIx64"\n", ctx->dest_iova);
> > +		fprintf(f, "status_ptr:\t%p\n", (void *)ctx->status_ptr);
> > +	} else if (priv->cfg_type == RTE_PMD_AFU_N3000_DMA) {
> > +		struct dma_afu_ctx *ctx = &priv->dma_ctx[priv-
> > >dma_cfg.index];
> > +		fprintf(f, "index:\t\t%d\n", ctx->index);
> > +		fprintf(f, "addr:\t\t%p\n", (void *)ctx->addr);
> > +		fprintf(f, "csr_addr:\t%p\n", (void *)ctx->csr_addr);
> > +		fprintf(f, "desc_addr:\t%p\n", (void *)ctx->desc_addr);
> > +		fprintf(f, "ase_ctrl_addr:\t%p\n", (void *)ctx->ase_ctrl_addr);
> > +		fprintf(f, "ase_data_addr:\t%p\n", (void *)ctx-
> >ase_data_addr);
> > +		fprintf(f, "desc_buf:\t%p\n", (void *)ctx->desc_buf);
> > +		fprintf(f, "magic_buf:\t%p\n", (void *)ctx->magic_buf);
> > +		fprintf(f, "magic_iova:\t0x%"PRIx64"\n", ctx->magic_iova);
> > +	} else {
> > +		return -EINVAL;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static int n3000_afu_reset(struct afu_mf_rawdev *dev)
> > +{
> > +	uint8_t *addr = NULL;
> > +	uint64_t val = 0;
> > +
> > +	addr = (uint8_t *)n3000_afu_get_port_addr(dev);
> > +	if (!addr)
> > +		return -ENOENT;
> > +
> > +	val = rte_read64(addr + PORT_CTRL_REG);
> > +	val |= PORT_SOFT_RESET;
> > +	rte_write64(val, addr + PORT_CTRL_REG);
> > +	rte_delay_us(100);
> > +	val &= ~PORT_SOFT_RESET;
> > +	rte_write64(val, addr + PORT_CTRL_REG);
> > +
> > +	return 0;
> > +}
> > +
> > +static struct afu_mf_ops n3000_afu_ops = {
> > +	.init = n3000_afu_init,
> > +	.config = n3000_afu_config,
> > +	.start = NULL,
> > +	.stop = NULL,
> > +	.test = n3000_afu_test,
> > +	.close = n3000_afu_close,
> > +	.dump = n3000_afu_dump,
> > +	.reset = n3000_afu_reset
> > +};
> > +
> > +struct afu_mf_drv n3000_afu_drv = {
> > +	.uuid = { N3000_AFU_UUID_L, N3000_AFU_UUID_H },
> > +	.ops = &n3000_afu_ops
> > +};
> > diff --git a/drivers/raw/afu_mf/n3000_afu.h
> b/drivers/raw/afu_mf/n3000_afu.h
> > new file mode 100644
> > index 0000000..4c740da
> > --- /dev/null
> > +++ b/drivers/raw/afu_mf/n3000_afu.h
> > @@ -0,0 +1,333 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2022 Intel Corporation
> > + */
> > +
> > +#ifndef _N3000_AFU_H_
> > +#define _N3000_AFU_H_
> > +
> > +#include "afu_mf_rawdev.h"
> > +#include "rte_pmd_afu.h"
> > +
> > +#define N3000_AFU_UUID_L  0xc000c9660d824272
> > +#define N3000_AFU_UUID_H  0x9aeffe5f84570612
> > +#define N3000_NLB0_UUID_L 0xf89e433683f9040b
> > +#define N3000_NLB0_UUID_H 0xd8424dc4a4a3c413
> > +#define N3000_DMA_UUID_L  0xa9149a35bace01ea
> > +#define N3000_DMA_UUID_H  0xef82def7f6ec40fc
> > +
> > +extern struct afu_mf_drv n3000_afu_drv;
> > +
> > +#define NUM_N3000_DMA  4
> > +#define MAX_MSIX_VEC   7
> > +
> > +/* N3000 DFL definition */
> > +#define DFH_UUID_L_OFFSET  8
> > +#define DFH_UUID_H_OFFSET  16
> > +#define DFH_TYPE(hdr)  (((hdr) >> 60) & 0xf)
> > +#define DFH_TYPE_AFU  1
> > +#define DFH_TYPE_BBB  2
> > +#define DFH_TYPE_PRIVATE  3
> > +#define DFH_EOL(hdr)  (((hdr) >> 40) & 0x1)
> > +#define DFH_NEXT_OFFSET(hdr)  (((hdr) >> 16) & 0xffffff)
> > +#define DFH_FEATURE_ID(hdr)  ((hdr) & 0xfff)
> > +#define PORT_ATTR_REG(n)  (((n) << 3) + 0x38)
> > +#define PORT_IMPLEMENTED(attr)  (((attr) >> 60) & 0x1)
> > +#define PORT_BAR(attr)  (((attr) >> 32) & 0x7)
> > +#define PORT_OFFSET(attr)  ((attr) & 0xffffff)
> > +#define PORT_FEATURE_UINT_ID  0x12
> > +#define PORT_UINT_CAP_REG  0x8
> > +/*
> > + * Port user interrupt capability register: the number of vectors is held in
> > + * bits [11:0] and the first vector number in bits [23:12].
> > + */
> > +#define PORT_VEC_START(cap)  (((cap) >> 12) & 0xfff)
> > +#define PORT_VEC_COUNT(cap)  ((cap) & 0xfff)
> > +#define PORT_CTRL_REG  0x38
> > +#define PORT_SOFT_RESET  (0x1 << 0)
> > +
> > +/* NLB registers definition */
> > +#define CSR_SCRATCHPAD0    0x100
> > +#define CSR_SCRATCHPAD1    0x108
> > +#define CSR_AFU_DSM_BASEL  0x110
> > +#define CSR_AFU_DSM_BASEH  0x114
> > +#define CSR_SRC_ADDR       0x120
> > +#define CSR_DST_ADDR       0x128
> > +#define CSR_NUM_LINES      0x130
> > +#define CSR_CTL            0x138
> > +#define CSR_CFG            0x140
> > +#define CSR_INACT_THRESH   0x148
> > +#define CSR_INTERRUPT0     0x150
> > +#define CSR_SWTEST_MSG     0x158
> > +#define CSR_STATUS0        0x160
> > +#define CSR_STATUS1        0x168
> > +#define CSR_ERROR          0x170
> > +#define CSR_STRIDE         0x178
> > +#define CSR_HE_INFO0       0x180
> > +
> > +#define DSM_SIZE           0x200000
> > +#define DSM_STATUS         0x40
> > +#define DSM_POLL_INTERVAL  5  /* ms */
> > +#define DSM_TIMEOUT        1000  /* ms */
> > +
> > +#define NLB_BUF_SIZE  0x400000
> > +#define TEST_MEM_ALIGN  1024
> > +
> > +struct nlb_csr_ctl {
> > +	union {
> > +		uint32_t csr;
> > +		struct {
> > +			uint32_t reset:1;
> > +			uint32_t start:1;
> > +			uint32_t force_completion:1;
> > +			uint32_t reserved:29;
> > +		};
> > +	};
> > +};
> > +
> > +struct nlb_csr_cfg {
> > +	union {
> > +		uint32_t csr;
> > +		struct {
> > +			uint32_t wrthru_en:1;
> > +			uint32_t cont:1;
> > +			uint32_t mode:3;
> > +			uint32_t multicl_len:2;
> > +			uint32_t rsvd1:1;
> > +			uint32_t delay_en:1;
> > +			uint32_t rdsel:2;
> > +			uint32_t rsvd2:1;
> > +			uint32_t chsel:3;
> > +			uint32_t rsvd3:1;
> > +			uint32_t wrpush_i:1;
> > +			uint32_t wr_chsel:3;
> > +			uint32_t rsvd4:3;
> > +			uint32_t test_cfg:5;
> > +			uint32_t interrupt_on_error:1;
> > +			uint32_t interrupt_testmode:1;
> > +			uint32_t wrfence_chsel:2;
> > +		};
> > +	};
> > +};
> > +
> > +struct nlb_status0 {
> > +	union {
> > +		uint64_t csr;
> > +		struct {
> > +			uint32_t num_writes;
> > +			uint32_t num_reads;
> > +		};
> > +	};
> > +};
> > +
> > +struct nlb_status1 {
> > +	union {
> > +		uint64_t csr;
> > +		struct {
> > +			uint32_t num_pend_writes;
> > +			uint32_t num_pend_reads;
> > +		};
> > +	};
> > +};
> > +
> > +struct nlb_dsm_status {
> > +	uint32_t test_complete;
> > +	uint32_t test_error;
> > +	uint64_t num_clocks;
> > +	uint32_t num_reads;
> > +	uint32_t num_writes;
> > +	uint32_t start_overhead;
> > +	uint32_t end_overhead;
> > +};
> > +
> > +/* DMA registers definition */
> > +#define DMA_CSR       0x40
> > +#define DMA_DESC      0x60
> > +#define DMA_ASE_CTRL  0x200
> > +#define DMA_ASE_DATA  0x1000
> > +
> > +#define DMA_ASE_WINDOW       4096
> > +#define DMA_ASE_WINDOW_MASK  ((uint64_t)(DMA_ASE_WINDOW -
> 1))
> > +#define INVALID_ASE_PAGE     0xffffffffffffffffULL
> > +
> > +#define DMA_WF_MAGIC             0x5772745F53796E63ULL
> > +#define DMA_WF_MAGIC_ROM         0x1000000000000
> > +#define DMA_HOST_ADDR(addr)      ((addr) | 0x2000000000000)
> > +#define DMA_WF_HOST_ADDR(addr)   ((addr) | 0x3000000000000)
> > +
> > +#define NUM_DMA_BUF   8
> > +#define HALF_DMA_BUF  (NUM_DMA_BUF / 2)
> > +
> > +#define DMA_MASK_32_BIT 0xFFFFFFFF
> > +
> > +#define DMA_CSR_BUSY           0x1
> > +#define DMA_DESC_BUFFER_EMPTY  0x2
> > +#define DMA_DESC_BUFFER_FULL   0x4
> > +
> > +#define DWORD_BYTES 4
> > +#define IS_ALIGNED_DWORD(addr) (((addr) % DWORD_BYTES) == 0)
> > +
> > +#define QWORD_BYTES 8
> > +#define IS_ALIGNED_QWORD(addr) (((addr) % QWORD_BYTES) == 0)
> > +
> > +#define DMA_ALIGN_BYTES 64
> > +#define IS_DMA_ALIGNED(addr) (((addr) % DMA_ALIGN_BYTES) == 0)
> > +
> > +#define CCIP_ALIGN_BYTES (DMA_ALIGN_BYTES << 2)
> > +
> > +#define DMA_TIMEOUT_MSEC  5000
> > +
> > +#define MAGIC_BUF_SIZE  64
> > +#define ERR_CHECK_LIMIT  64
> > +
> > +#ifndef MIN
> > +#define MIN(a, b) ((a) < (b) ? (a) : (b))
> > +#endif
> > +
> > +#ifndef ARRAY_SIZE
> > +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
> > +#endif
> > +
> > +typedef enum {
> > +	HOST_TO_FPGA = 0,
> > +	FPGA_TO_HOST,
> > +	FPGA_TO_FPGA,
> > +	FPGA_MAX_TRANSFER_TYPE,
> > +} fpga_dma_type;
> > +
> > +typedef union {
> > +	uint32_t csr;
> > +	struct {
> > +		uint32_t tx_channel:8;
> > +		uint32_t generate_sop:1;
> > +		uint32_t generate_eop:1;
> > +		uint32_t park_reads:1;
> > +		uint32_t park_writes:1;
> > +		uint32_t end_on_eop:1;
> > +		uint32_t reserved_1:1;
> > +		uint32_t transfer_irq_en:1;
> > +		uint32_t early_term_irq_en:1;
> > +		uint32_t trans_error_irq_en:8;
> > +		uint32_t early_done_en:1;
> > +		uint32_t reserved_2:6;
> > +		uint32_t go:1;
> > +	};
> > +} msgdma_desc_ctrl;
> > +
> > +typedef struct __rte_packed {
> > +	uint32_t rd_address;
> > +	uint32_t wr_address;
> > +	uint32_t len;
> > +	uint16_t seq_num;
> > +	uint8_t rd_burst_count;
> > +	uint8_t wr_burst_count;
> > +	uint16_t rd_stride;
> > +	uint16_t wr_stride;
> > +	uint32_t rd_address_ext;
> > +	uint32_t wr_address_ext;
> > +	msgdma_desc_ctrl control;
> > +} msgdma_ext_desc;
> > +
> > +typedef union {
> > +	uint32_t csr;
> > +	struct {
> > +		uint32_t busy:1;
> > +		uint32_t desc_buf_empty:1;
> > +		uint32_t desc_buf_full:1;
> > +		uint32_t rsp_buf_empty:1;
> > +		uint32_t rsp_buf_full:1;
> > +		uint32_t stopped:1;
> > +		uint32_t resetting:1;
> > +		uint32_t stopped_on_error:1;
> > +		uint32_t stopped_on_early_term:1;
> > +		uint32_t irq:1;
> > +		uint32_t reserved:22;
> > +	};
> > +} msgdma_status;
> > +
> > +/* DMA control register: six flag bits overlaid on the 32-bit csr value. */
> > +typedef union {
> > +	uint32_t csr;
> > +	struct {
> > +		uint32_t stop_dispatcher:1;
> > +		uint32_t reset_dispatcher:1;
> > +		uint32_t stop_on_error:1;
> > +		uint32_t stopped_on_early_term:1;
> > +		uint32_t global_intr_en_mask:1;
> > +		uint32_t stop_descriptors:1;
> > +		uint32_t reserved:26;  /* pads the six flags to 32 bits */
> > +	};
> > +} msgdma_ctrl;
> > +
> > +typedef union {
> > +	uint32_t csr;
> > +	struct {
> > +		uint32_t rd_fill_level:16;
> > +		uint32_t wr_fill_level:16;
> > +	};
> > +} msgdma_fill_level;
> > +
> > +typedef union {
> > +	uint32_t csr;
> > +	struct {
> > +		uint32_t rsp_fill_level:16;
> > +		uint32_t reserved:16;
> > +	};
> > +} msgdma_rsp_level;
> > +
> > +typedef union {
> > +	uint32_t csr;
> > +	struct {
> > +		uint32_t rd_seq_num:16;
> > +		uint32_t wr_seq_num:16;
> > +	};
> > +} msgdma_seq_num;
> > +
> > +typedef struct __rte_packed {
> > +	msgdma_status status;
> > +	msgdma_ctrl ctrl;
> > +	msgdma_fill_level fill_level;
> > +	msgdma_rsp_level rsp;
> > +	msgdma_seq_num seq_num;
> > +} msgdma_csr;
> > +
> > +#define CSR_STATUS(csr)   (&(((msgdma_csr *)(csr))->status))
> > +#define CSR_CONTROL(csr)  (&(((msgdma_csr *)(csr))->ctrl))
> > +
> > +struct nlb_afu_ctx {
> > +	uint8_t *addr;
> > +	uint8_t *dsm_ptr;
> > +	uint64_t dsm_iova;
> > +	uint8_t *src_ptr;
> > +	uint64_t src_iova;
> > +	uint8_t *dest_ptr;
> > +	uint64_t dest_iova;
> > +	struct nlb_dsm_status *status_ptr;
> > +};
> > +
> > +struct dma_afu_ctx {
> > +	int index;
> > +	uint8_t *addr;
> > +	uint8_t *csr_addr;
> > +	uint8_t *desc_addr;
> > +	uint8_t *ase_ctrl_addr;
> > +	uint8_t *ase_data_addr;
> > +	uint64_t mem_size;
> > +	uint64_t cur_ase_page;
> > +	int event_fd;
> > +	int verbose;
> > +	int pattern;
> > +	void *data_buf;
> > +	void *ref_buf;
> > +	msgdma_ext_desc *desc_buf;
> > +	uint64_t *magic_buf;
> > +	uint64_t magic_iova;
> > +	uint32_t dma_buf_size;
> > +	uint64_t *dma_buf[NUM_DMA_BUF];
> > +	uint64_t dma_iova[NUM_DMA_BUF];
> > +};
> > +
> > +struct n3000_afu_priv {
> > +	struct rte_pmd_afu_nlb_cfg nlb_cfg;
> > +	struct rte_pmd_afu_dma_cfg dma_cfg;
> > +	struct nlb_afu_ctx nlb_ctx;
> > +	struct dma_afu_ctx dma_ctx[NUM_N3000_DMA];
> > +	int num_dma;
> > +	int cfg_type;
> > +};
> > +
> > +#endif /* _N3000_AFU_H_ */
> > diff --git a/drivers/raw/afu_mf/rte_pmd_afu.h
> > b/drivers/raw/afu_mf/rte_pmd_afu.h
> > new file mode 100644
> > index 0000000..f14a053
> > --- /dev/null
> > +++ b/drivers/raw/afu_mf/rte_pmd_afu.h
> > @@ -0,0 +1,97 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright 2022 Intel Corporation
> > + */
> > +
> > +#ifndef __RTE_PMD_AFU_H__
> > +#define __RTE_PMD_AFU_H__
> > +
> > +/**
> > + * @file rte_pmd_afu.h
> > + *
> > + * AFU PMD specific definitions.
> > + *
> > + * @b EXPERIMENTAL: this API may change, or be removed, without prior
> > notice
> > + *
> > + */
> > +
> > +#ifdef __cplusplus
> > +extern "C" {
> > +#endif
> > +
> > +#include <stdint.h>
> > +
> > +#define RTE_PMD_AFU_N3000_NLB   1
> > +#define RTE_PMD_AFU_N3000_DMA   2
> > +
> > +#define NLB_MODE_LPBK      0
> > +#define NLB_MODE_READ      1
> > +#define NLB_MODE_WRITE     2
> > +#define NLB_MODE_TRPUT     3
> > +
> > +#define NLB_VC_AUTO        0
> > +#define NLB_VC_VL0         1
> > +#define NLB_VC_VH0         2
> > +#define NLB_VC_VH1         3
> > +#define NLB_VC_RANDOM      4
> > +
> > +#define NLB_WRLINE_M       0
> > +#define NLB_WRLINE_I       1
> > +#define NLB_WRPUSH_I       2
> > +
> > +#define NLB_RDLINE_S       0
> > +#define NLB_RDLINE_I       1
> > +#define NLB_RDLINE_MIXED   2
> > +
> > +#define MIN_CACHE_LINES   1
> > +#define MAX_CACHE_LINES   1024
> > +
> > +#define MIN_DMA_BUF_SIZE  64
> > +#define MAX_DMA_BUF_SIZE  (1023 * 1024)
> > +
> > +/**
> > + * NLB AFU configuration data structure.
> > + */
> > +struct rte_pmd_afu_nlb_cfg {
> > +	uint32_t mode;
> > +	uint32_t begin;
> > +	uint32_t end;
> > +	uint32_t multi_cl;
> > +	uint32_t cont;
> > +	uint32_t timeout;
> > +	uint32_t cache_policy;
> > +	uint32_t cache_hint;
> > +	uint32_t read_vc;
> > +	uint32_t write_vc;
> > +	uint32_t wrfence_vc;
> > +	uint32_t freq_mhz;
> > +};
> > +
> > +/**
> > + * DMA AFU configuration data structure.
> > + */
> > +struct rte_pmd_afu_dma_cfg {
> > +	uint32_t index;     /* index of DMA controller */
> > +	uint32_t length;    /* total length of data to DMA */
> > +	uint32_t offset;    /* address offset of target memory */
> > +	uint32_t size;      /* size of transfer buffer */
> > +	uint32_t pattern;   /* data pattern to fill in test buffer */
> > +	uint32_t unaligned; /* use unaligned address or length in sweep test
> */
> > +	uint32_t verbose;   /* enable verbose error information in test */
> > +};
> > +
> > +/**
> > + * N3000 AFU configuration data structure.
> > + */
> > +struct rte_pmd_afu_n3000_cfg {
> > +	int type;   /* RTE_PMD_AFU_N3000_NLB or
> > RTE_PMD_AFU_N3000_DMA */
> > +	union {
> > +		struct rte_pmd_afu_nlb_cfg nlb_cfg;
> > +		struct rte_pmd_afu_dma_cfg dma_cfg;
> > +	};
> > +};
> > +
> > +#ifdef __cplusplus
> > +}
> > +#endif
> > +
> > +#endif /* __RTE_PMD_AFU_H__ */
> > --
> > 1.8.3.1
  

Patch

diff --git a/drivers/raw/afu_mf/afu_mf_rawdev.c b/drivers/raw/afu_mf/afu_mf_rawdev.c
index 5be372a..7c18f3b 100644
--- a/drivers/raw/afu_mf/afu_mf_rawdev.c
+++ b/drivers/raw/afu_mf/afu_mf_rawdev.c
@@ -17,15 +17,19 @@ 
 #include <rte_memzone.h>
 #include <rte_rawdev_pmd.h>
 
+#include "rte_pmd_afu.h"
 #include "afu_mf_rawdev.h"
+#include "n3000_afu.h"
 
 #define AFU_MF_PMD_RAWDEV_NAME rawdev_afu_mf
 
 static const struct rte_afu_uuid afu_uuid_map[] = {
+	{ N3000_AFU_UUID_L, N3000_AFU_UUID_H },
 	{ 0, 0 /* sentinel */ }
 };
 
 static struct afu_mf_drv *afu_table[] = {
+	&n3000_afu_drv,
 	NULL
 };
 
diff --git a/drivers/raw/afu_mf/afu_mf_rawdev.h b/drivers/raw/afu_mf/afu_mf_rawdev.h
index df6715c..5a66f6c 100644
--- a/drivers/raw/afu_mf/afu_mf_rawdev.h
+++ b/drivers/raw/afu_mf/afu_mf_rawdev.h
@@ -30,6 +30,24 @@ 
 #define AFU_MF_PMD_WARN(fmt, args...) \
 	AFU_MF_PMD_LOG(WARNING, fmt, ## args)
 
+#define CLS_TO_SIZE(n)  ((n) << 6)  /* get size of n cache lines */
+#define SIZE_TO_CLS(s)  ((s) >> 6)  /* convert size to number of cache lines */
+#define MHZ(f)  ((f) * 1000000)  /* scale a MHz value by 10^6, in the type of f */
+
+/*
+ * Repeatedly load *addr into val until cond becomes true or the accumulated
+ * wait exceeds timeout, sleeping invl (passed to rte_delay_ms) between loads.
+ * The whole expression evaluates to 0 when cond holds at the end and to 1
+ * otherwise. cond is evaluated again after the loop, so it should have no
+ * side effects. This relies on the GNU statement-expression extension.
+ */
+#define dsm_poll_timeout(addr, val, cond, invl, timeout) \
+({                                                       \
+	uint64_t __wait = 0;                                 \
+	uint64_t __invl = (invl);                            \
+	uint64_t __timeout = (timeout);                      \
+	for (; __wait <= __timeout; __wait += __invl) {      \
+		(val) = *(addr);                                 \
+		if (cond)                                        \
+			break;                                       \
+		rte_delay_ms(__invl);                            \
+	}                                                    \
+	(cond) ? 0 : 1;                                      \
+})
+
 struct afu_mf_rawdev;
 
 struct afu_mf_ops {
diff --git a/drivers/raw/afu_mf/meson.build b/drivers/raw/afu_mf/meson.build
index 80526a2..8a989e3 100644
--- a/drivers/raw/afu_mf/meson.build
+++ b/drivers/raw/afu_mf/meson.build
@@ -2,4 +2,6 @@ 
 # Copyright 2022 Intel Corporation
 
 deps += ['rawdev', 'bus_pci', 'bus_ifpga']
-sources = files('afu_mf_rawdev.c')
+sources = files('afu_mf_rawdev.c', 'n3000_afu.c')
+
+headers = files('rte_pmd_afu.h')
diff --git a/drivers/raw/afu_mf/n3000_afu.c b/drivers/raw/afu_mf/n3000_afu.c
new file mode 100644
index 0000000..19d7c54
--- /dev/null
+++ b/drivers/raw/afu_mf/n3000_afu.c
@@ -0,0 +1,2005 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Intel Corporation
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <sys/eventfd.h>
+#include <sys/ioctl.h>
+
+#include <rte_eal.h>
+#include <rte_malloc.h>
+#include <rte_memcpy.h>
+#include <rte_io.h>
+#include <rte_vfio.h>
+#include <rte_bus_pci.h>
+#include <rte_bus_ifpga.h>
+#include <rte_rawdev.h>
+
+#include "afu_mf_rawdev.h"
+#include "n3000_afu.h"
+
+/*
+ * Build the NLB CSR_CFG value from the stored NLB configuration and write it
+ * to the device.
+ *
+ * Returns 0 on success, -EINVAL if dev is NULL, -ENOENT if dev has no private
+ * data.
+ */
+static int nlb_afu_config(struct afu_mf_rawdev *dev)
+{
+	struct n3000_afu_priv *priv = NULL;
+	struct rte_pmd_afu_nlb_cfg *cfg = NULL;
+	struct nlb_csr_cfg v;
+
+	if (!dev)
+		return -EINVAL;
+
+	if (!dev->priv)
+		return -ENOENT;
+
+	priv = (struct n3000_afu_priv *)dev->priv;
+	cfg = &priv->nlb_cfg;
+
+	v.csr = 0;
+
+	if (cfg->cont)
+		v.cont = 1;
+
+	/*
+	 * wrthru_en is a single bit: it is taken from cache_policy for every
+	 * policy except NLB_WRPUSH_I, which has its own bit and leaves
+	 * wrthru_en clear.
+	 */
+	if (cfg->cache_policy == NLB_WRPUSH_I)
+		v.wrpush_i = 1;
+	else
+		v.wrthru_en = cfg->cache_policy;
+
+	/* NLB_RDLINE_MIXED is written as 3 in rdsel, other hints as-is */
+	if (cfg->cache_hint == NLB_RDLINE_MIXED)
+		v.rdsel = 3;
+	else
+		v.rdsel = cfg->cache_hint;
+
+	v.mode = cfg->mode;
+	v.chsel = cfg->read_vc;
+	v.wr_chsel = cfg->write_vc;
+	v.wrfence_chsel = cfg->wrfence_vc;
+	/* the register field holds the multi-cacheline count minus one */
+	v.multicl_len = cfg->multi_cl - 1;
+
+	AFU_MF_PMD_DEBUG("cfg: 0x%08x", v.csr);
+	rte_write32(v.csr, priv->nlb_ctx.addr + CSR_CFG);
+
+	return 0;
+}
+
+/*
+ * Print the read/write counters and the derived bandwidth of one NLB run.
+ *
+ * dev: AFU raw device whose private data holds the NLB configuration and the
+ *      DSM status pointer; nothing is printed if dev or dev->priv is NULL.
+ * cl:  number of cache lines of the run, printed in the first column.
+ */
+static void nlb_afu_report(struct afu_mf_rawdev *dev, uint32_t cl)
+{
+	struct n3000_afu_priv *priv = NULL;
+	struct rte_pmd_afu_nlb_cfg *cfg = NULL;
+	struct nlb_dsm_status *stat = NULL;
+	uint64_t ticks = 0;
+	double num, rd_bw, wr_bw;
+
+	if (!dev || !dev->priv)
+		return;
+
+	priv = (struct n3000_afu_priv *)dev->priv;
+
+	cfg = &priv->nlb_cfg;
+	stat = priv->nlb_ctx.status_ptr;
+
+	/*
+	 * Clocks attributed to the transfer: the start overhead is always
+	 * subtracted, the end overhead only when not in continuous mode.
+	 */
+	if (cfg->cont)
+		ticks = stat->num_clocks - stat->start_overhead;
+	else
+		ticks = stat->num_clocks -
+			(stat->start_overhead + stat->end_overhead);
+
+	/* no frequency configured: use 200 and store it back in the config */
+	if (cfg->freq_mhz == 0)
+		cfg->freq_mhz = 200;
+
+	/*
+	 * count * bytes-per-cache-line * frequency / clocks.
+	 * NOTE(review): ticks is not checked for 0 before the division -
+	 * confirm the device always reports a non-zero clock count here.
+	 */
+	num = (double)stat->num_reads;
+	rd_bw = (num * CLS_TO_SIZE(1) * MHZ(cfg->freq_mhz)) / ticks;
+	num = (double)stat->num_writes;
+	wr_bw = (num * CLS_TO_SIZE(1) * MHZ(cfg->freq_mhz)) / ticks;
+
+	printf("Cachelines  Read_Count Write_Count Clocks@%uMHz   "
+		"Rd_Bandwidth   Wr_Bandwidth\n", cfg->freq_mhz);
+	printf("%10u  %10u %11u  %12"PRIu64"   %7.3f GB/s   %7.3f GB/s\n",
+		cl, stat->num_reads, stat->num_writes, ticks,
+		rd_bw / 1e9, wr_bw / 1e9);
+}
+
+/*
+ * Run the NLB test for every cache-line count from cfg->begin to cfg->end in
+ * steps of cfg->multi_cl, printing a report line per run and comparing the
+ * destination buffer with the generated source data.
+ *
+ * Returns 0 on completion, -EINVAL if dev is NULL, -ENOENT if dev has no
+ * private data, the error of nlb_afu_config(), or the non-zero result of
+ * dsm_poll_timeout() when the completion flag is not seen in time.
+ *
+ * NOTE(review): a data mismatch is only logged; it does not change the return
+ * value - confirm whether callers are expected to treat it as a failure.
+ */
+static int nlb_afu_test(struct afu_mf_rawdev *dev)
+{
+	struct n3000_afu_priv *priv = NULL;
+	struct nlb_afu_ctx *ctx = NULL;
+	struct rte_pmd_afu_nlb_cfg *cfg = NULL;
+	struct nlb_csr_ctl ctl;
+	uint32_t *ptr = NULL;
+	uint32_t i, j, cl, val = 0;
+	uint64_t sval = 0;
+	int ret = 0;
+
+	if (!dev)
+		return -EINVAL;
+
+	if (!dev->priv)
+		return -ENOENT;
+
+	priv = (struct n3000_afu_priv *)dev->priv;
+	ctx = &priv->nlb_ctx;
+	cfg = &priv->nlb_cfg;
+
+	/* initialize registers */
+	AFU_MF_PMD_DEBUG("dsm_addr: 0x%"PRIx64, ctx->dsm_iova);
+	rte_write64(ctx->dsm_iova, ctx->addr + CSR_AFU_DSM_BASEL);
+
+	/* write CSR_CTL as 0, then again with the reset bit set */
+	ctl.csr = 0;
+	rte_write32(ctl.csr, ctx->addr + CSR_CTL);
+	ctl.reset = 1;
+	rte_write32(ctl.csr, ctx->addr + CSR_CTL);
+
+	/* source and destination are programmed as cache-line numbers */
+	AFU_MF_PMD_DEBUG("src_addr: 0x%"PRIx64, ctx->src_iova);
+	rte_write64(SIZE_TO_CLS(ctx->src_iova), ctx->addr + CSR_SRC_ADDR);
+	AFU_MF_PMD_DEBUG("dst_addr: 0x%"PRIx64, ctx->dest_iova);
+	rte_write64(SIZE_TO_CLS(ctx->dest_iova), ctx->addr + CSR_DST_ADDR);
+
+	ret = nlb_afu_config(dev);
+	if (ret)
+		return ret;
+
+	/* initialize src data */
+	/* 32-bit words 0, 1, 2, ... covering cfg->end cache lines */
+	ptr = (uint32_t *)ctx->src_ptr;
+	j = CLS_TO_SIZE(cfg->end) >> 2;
+	for (i = 0; i < j; i++)
+		*ptr++ = i;
+
+	/* start test */
+	for (cl = cfg->begin; cl <= cfg->end; cl += cfg->multi_cl) {
+		/* clear the part of dest used by this run and the whole DSM */
+		memset(ctx->dest_ptr, 0, CLS_TO_SIZE(cl));
+		memset(ctx->dsm_ptr, 0, DSM_SIZE);
+
+		ctl.csr = 0;
+		rte_write32(ctl.csr, ctx->addr + CSR_CTL);
+		ctl.reset = 1;
+		rte_write32(ctl.csr, ctx->addr + CSR_CTL);
+
+		rte_write32(cl, ctx->addr + CSR_NUM_LINES);
+
+		rte_delay_us(10);
+
+		ctl.start = 1;
+		rte_write32(ctl.csr, ctx->addr + CSR_CTL);
+
+		if (cfg->cont) {
+			/*
+			 * continuous mode: let it run for cfg->timeout * 1000
+			 * ms, force completion, then wait for the flag
+			 */
+			rte_delay_ms(cfg->timeout * 1000);
+			ctl.force_completion = 1;
+			rte_write32(ctl.csr, ctx->addr + CSR_CTL);
+			ret = dsm_poll_timeout(&ctx->status_ptr->test_complete,
+				val, (val & 0x1) == 1, DSM_POLL_INTERVAL,
+				DSM_TIMEOUT);
+			if (ret) {
+				printf("DSM poll timeout\n");
+				goto end;
+			}
+		} else {
+			/* wait for the completion flag first, then force */
+			ret = dsm_poll_timeout(&ctx->status_ptr->test_complete,
+				val, (val & 0x1) == 1, DSM_POLL_INTERVAL,
+				DSM_TIMEOUT);
+			if (ret) {
+				printf("DSM poll timeout\n");
+				goto end;
+			}
+			ctl.force_completion = 1;
+			rte_write32(ctl.csr, ctx->addr + CSR_CTL);
+		}
+
+		nlb_afu_report(dev, cl);
+
+		/* up to 100 reads, 1000 us apart, until CSR_STATUS1 is 0 */
+		i = 0;
+		while (i++ < 100) {
+			sval = rte_read64(ctx->addr + CSR_STATUS1);
+			if (sval == 0)
+				break;
+			rte_delay_us(1000);
+		}
+
+		/* dest must hold the same word sequence written to src */
+		ptr = (uint32_t *)ctx->dest_ptr;
+		j = CLS_TO_SIZE(cl) >> 2;
+		for (i = 0; i < j; i++) {
+			if (*ptr++ != i) {
+				AFU_MF_PMD_ERR("Data mismatch @ %u", i);
+				break;
+			}
+		}
+	}
+
+end:
+	return ret;
+}
+
+/*
+ * Release every test buffer owned by a DMA context (the transfer buffers, the
+ * data buffer and the reference buffer) and clear the stored pointers.
+ * A NULL ctx is ignored.
+ */
+static void dma_afu_buf_free(struct dma_afu_ctx *ctx)
+{
+	uint64_t **slot = NULL;
+
+	if (ctx == NULL)
+		return;
+
+	for (slot = ctx->dma_buf; slot < ctx->dma_buf + NUM_DMA_BUF; slot++) {
+		rte_free(*slot);
+		*slot = NULL;
+	}
+
+	rte_free(ctx->data_buf);
+	rte_free(ctx->ref_buf);
+	ctx->data_buf = NULL;
+	ctx->ref_buf = NULL;
+}
+
+/*
+ * Allocate the buffers used by a DMA test: NUM_DMA_BUF zeroed transfer
+ * buffers of cfg->size bytes (with their IOVA recorded) plus a data buffer
+ * and a reference buffer of cfg->length bytes each, page aligned.
+ *
+ * Returns 0 on success, -EINVAL if ctx or cfg is NULL, -ENOMEM if any
+ * allocation or address translation fails; on -ENOMEM everything is released
+ * through dma_afu_buf_free().
+ */
+static int dma_afu_buf_alloc(struct dma_afu_ctx *ctx,
+	struct rte_pmd_afu_dma_cfg *cfg)
+{
+	size_t align = sysconf(_SC_PAGE_SIZE);
+	int n = 0;
+
+	if (ctx == NULL || cfg == NULL)
+		return -EINVAL;
+
+	while (n < NUM_DMA_BUF) {
+		ctx->dma_buf[n] = (uint64_t *)rte_zmalloc(NULL, cfg->size,
+			TEST_MEM_ALIGN);
+		if (ctx->dma_buf[n] == NULL)
+			goto nomem;
+
+		ctx->dma_iova[n] = rte_malloc_virt2iova(ctx->dma_buf[n]);
+		if (ctx->dma_iova[n] == RTE_BAD_IOVA)
+			goto nomem;
+
+		n++;
+	}
+
+	ctx->data_buf = rte_malloc(NULL, cfg->length, align);
+	if (ctx->data_buf == NULL)
+		goto nomem;
+
+	ctx->ref_buf = rte_malloc(NULL, cfg->length, align);
+	if (ctx->ref_buf == NULL)
+		goto nomem;
+
+	return 0;
+
+nomem:
+	dma_afu_buf_free(ctx);
+	return -ENOMEM;
+}
+
+/*
+ * Fill the reference buffer with test data and mirror it into the data
+ * buffer. A non-zero ctx->pattern is used as the fill value for memset();
+ * otherwise rand() output from a fixed seed is stored word by word.
+ * Does nothing if ctx is NULL or size is 0.
+ */
+static void dma_afu_buf_init(struct dma_afu_ctx *ctx, size_t size)
+{
+	int *words = NULL;
+	size_t nwords = 0;
+	size_t k = 0;
+
+	if (ctx == NULL || size == 0)
+		return;
+
+	words = (int *)ctx->ref_buf;
+
+	if (ctx->pattern != 0) {
+		memset(words, ctx->pattern, size);
+	} else {
+		/* fixed seed: the same sequence is generated on every call */
+		srand(99);
+		nwords = size >> 2;
+		for (k = 0; k < nwords; k++)
+			words[k] = rand();
+	}
+
+	rte_memcpy(ctx->data_buf, ctx->ref_buf, size);
+}
+
+/*
+ * Compare the first size bytes of the data buffer against the reference
+ * buffer and print the verdict. With ctx->verbose set, differing bytes are
+ * listed until the running count reaches ERR_CHECK_LIMIT.
+ *
+ * Returns 0 when the buffers match, -1 when they differ, -EINVAL if ctx is
+ * NULL or size is 0.
+ */
+static int dma_afu_buf_verify(struct dma_afu_ctx *ctx, size_t size)
+{
+	const uint8_t *expect = NULL;
+	const uint8_t *actual = NULL;
+	size_t pos = 0;
+	int bad = 0;
+
+	if (ctx == NULL || size == 0)
+		return -EINVAL;
+
+	expect = (const uint8_t *)ctx->ref_buf;
+	actual = (const uint8_t *)ctx->data_buf;
+
+	if (memcmp(expect, actual, size) == 0) {
+		printf("Transfer is verified\n");
+		return 0;
+	}
+
+	printf("Transfer is corrupted\n");
+	if (!ctx->verbose)
+		return -1;
+
+	for (pos = 0; pos < size; pos++) {
+		if (expect[pos] == actual[pos])
+			continue;
+		/* the mismatch that reaches the limit is counted, not shown */
+		if (++bad >= ERR_CHECK_LIMIT)
+			break;
+		printf("Mismatch at 0x%zx, "
+			"Expected %02x  Actual %02x\n",
+			pos, expect[pos], actual[pos]);
+	}
+
+	if (bad >= ERR_CHECK_LIMIT) {
+		printf("......\n");
+		printf("Found more than %d error bytes\n", bad);
+	} else {
+		printf("Found %d error bytes\n", bad);
+	}
+
+	return -1;
+}
+
+/*
+ * Copy bytes from host memory to device memory with 64-bit MMIO writes.
+ * Nothing is written unless dev_addr and bytes are both multiples of 8.
+ */
+static void blk_write64(uint64_t *dev_addr, uint64_t *host_addr, uint64_t bytes)
+{
+	uint64_t total = 0;
+	uint64_t idx = 0;
+
+	if (!IS_ALIGNED_QWORD((uint64_t)dev_addr))
+		return;
+	if (!IS_ALIGNED_QWORD((uint64_t)bytes))
+		return;
+
+	total = bytes / sizeof(uint64_t);
+	for (idx = 0; idx < total; idx++)
+		rte_write64(host_addr[idx], &dev_addr[idx]);
+}
+
+/*
+ * Copy bytes from device memory to host memory with 64-bit MMIO reads.
+ * Nothing is read unless dev_addr and bytes are both multiples of 8.
+ */
+static void blk_read64(uint64_t *dev_addr, uint64_t *host_addr, uint64_t bytes)
+{
+	uint64_t total = 0;
+	uint64_t idx = 0;
+
+	if (!IS_ALIGNED_QWORD((uint64_t)dev_addr))
+		return;
+	if (!IS_ALIGNED_QWORD((uint64_t)bytes))
+		return;
+
+	total = bytes / sizeof(uint64_t);
+	for (idx = 0; idx < total; idx++)
+		host_addr[idx] = rte_read64(&dev_addr[idx]);
+}
+
+/*
+ * Select the ASE window page that contains addr. The page base is written to
+ * the ASE control register only when it differs from the page cached in
+ * ctx->cur_ase_page. A NULL ctx is ignored.
+ */
+static void switch_ase_page(struct dma_afu_ctx *ctx, uint64_t addr)
+{
+	uint64_t page = 0;
+
+	if (ctx == NULL)
+		return;
+
+	page = addr & ~DMA_ASE_WINDOW_MASK;
+	if (page == ctx->cur_ase_page)
+		return;
+
+	rte_write64(page, ctx->ase_ctrl_addr);
+	ctx->cur_ase_page = page;
+}
+
+/*
+ * Write fewer than QWORD_BYTES bytes to device memory through the ASE window
+ * by reading the 64-bit word that contains dev_addr, patching the affected
+ * bytes and writing the word back.
+ *
+ * ctx:       DMA context providing the ASE window.
+ * dev_addr:  device address of the first byte to write.
+ * host_addr: host virtual address of the source bytes.
+ * count:     number of bytes; they must all lie in the 64-bit word that
+ *            contains dev_addr.
+ *
+ * Returns 0 on success (including count == 0), -EINVAL if ctx is NULL, count
+ * is QWORD_BYTES or more, or the bytes would cross the end of that word.
+ */
+static int ase_write_unaligned(struct dma_afu_ctx *ctx, uint64_t dev_addr,
+	uint64_t host_addr, uint32_t count)
+{
+	uint64_t dev_aligned_addr = 0;
+	uint64_t shift = 0;
+	uint64_t val = 0;
+	uintptr_t addr = (uintptr_t)host_addr;  /* transfer to pointer size */
+
+	AFU_MF_PMD_DEBUG("0x%"PRIx64" --> 0x%"PRIx64" (0x%x)", host_addr,
+		dev_addr, count);
+
+	if (!ctx || (count >= QWORD_BYTES))
+		return -EINVAL;
+
+	if (!count)
+		return 0;
+
+	/*
+	 * The bytes are patched into the local 64-bit copy below; refuse a
+	 * range that would run past its end instead of overrunning it.
+	 */
+	shift = dev_addr % QWORD_BYTES;
+	if (shift + count > QWORD_BYTES)
+		return -EINVAL;
+
+	switch_ase_page(ctx, dev_addr);
+
+	dev_aligned_addr = (dev_addr - shift) & DMA_ASE_WINDOW_MASK;
+	val = rte_read64(ctx->ase_data_addr + dev_aligned_addr);
+	rte_memcpy(((char *)(&val)) + shift, (void *)addr, count);
+
+	/* write back to device */
+	rte_write64(val, ctx->ase_data_addr + dev_aligned_addr);
+
+	return 0;
+}
+
+/*
+ * Copy DWORD/QWORD sized pieces from host memory to device memory through
+ * the ASE window.
+ *
+ * dst_ptr: in/out device address; must be DWORD aligned on entry.
+ * src_ptr: in/out host virtual address of the source data.
+ * count:   in/out number of bytes still to transfer.
+ *
+ * On return the three cursors describe what is left: at most DWORD_BYTES - 1
+ * trailing bytes, which the caller has to write by other means. When fewer
+ * than DWORD_BYTES bytes are requested nothing is written and the cursors are
+ * left untouched.
+ *
+ * Returns 0 on success, -EINVAL if ctx is NULL or the destination is not
+ * DWORD aligned.
+ */
+static int ase_write(struct dma_afu_ctx *ctx, uint64_t *dst_ptr,
+	uint64_t *src_ptr, uint64_t *count)
+{
+	uint64_t src = *src_ptr;
+	uint64_t dst = *dst_ptr;
+	uint64_t align_bytes = *count;
+	uint64_t offset = 0;
+	uint64_t left_in_page = 0;
+	uint64_t size_to_copy = 0;
+
+	AFU_MF_PMD_DEBUG("0x%"PRIx64" --> 0x%"PRIx64" (0x%"PRIx64")", src, dst,
+		align_bytes);
+
+	if (!ctx || !IS_ALIGNED_DWORD(dst))
+		return -EINVAL;
+
+	if (align_bytes < DWORD_BYTES)
+		return 0;
+
+	if (!IS_ALIGNED_QWORD(dst)) {
+		/* Write out a single DWORD to get QWORD aligned */
+		switch_ase_page(ctx, dst);
+		offset = dst & DMA_ASE_WINDOW_MASK;
+
+		rte_write32(*(uint32_t *)(uintptr_t)src,
+			ctx->ase_data_addr + offset);
+		src += DWORD_BYTES;
+		dst += DWORD_BYTES;
+		align_bytes -= DWORD_BYTES;
+	}
+
+	/*
+	 * No early return when that DWORD was all there was to write: the
+	 * steps below are skipped by their own conditions and the cursors
+	 * still get updated at the end.
+	 */
+
+	/* Write out blocks of 64-bit values */
+	while (align_bytes >= QWORD_BYTES) {
+		/*
+		 * Room left in the window that contains dst. This has to be
+		 * recomputed on every pass; carrying the value over from the
+		 * previous window makes it shrink to 0 and ends the copy
+		 * early.
+		 */
+		left_in_page = DMA_ASE_WINDOW - (dst & DMA_ASE_WINDOW_MASK);
+		size_to_copy =
+			MIN(left_in_page, (align_bytes & ~(QWORD_BYTES - 1)));
+		if (size_to_copy < QWORD_BYTES)
+			break;
+		switch_ase_page(ctx, dst);
+		offset = dst & DMA_ASE_WINDOW_MASK;
+		blk_write64((uint64_t *)(ctx->ase_data_addr + offset),
+			(uint64_t *)(uintptr_t)src, size_to_copy);
+		src += size_to_copy;
+		dst += size_to_copy;
+		align_bytes -= size_to_copy;
+	}
+
+	if (align_bytes >= DWORD_BYTES) {
+		/* Write out remaining DWORD */
+		switch_ase_page(ctx, dst);
+		offset = dst & DMA_ASE_WINDOW_MASK;
+		rte_write32(*(uint32_t *)(uintptr_t)src,
+			ctx->ase_data_addr + offset);
+		src += DWORD_BYTES;
+		dst += DWORD_BYTES;
+		align_bytes -= DWORD_BYTES;
+	}
+
+	/* report progress back to the caller */
+	*src_ptr = src;
+	*dst_ptr = dst;
+	*count = align_bytes;
+
+	return 0;
+}
+
+/*
+ * Copy count bytes from host memory to the device through the ASE window.
+ *
+ * Unless the device address is already DWORD or QWORD aligned, the bytes up
+ * to the next QWORD boundary go through ase_write_unaligned(). The bulk is
+ * then handled by ase_write() and whatever is left afterwards is written
+ * with a second ase_write_unaligned() call.
+ *
+ * On success *dst_ptr and *src_ptr are advanced past the transferred data
+ * and 0 is returned; otherwise the error of the failing helper is returned.
+ */
+static int ase_host_to_fpga(struct dma_afu_ctx *ctx, uint64_t *dst_ptr,
+	uint64_t *src_ptr, uint64_t count)
+{
+	uint64_t dev = *dst_ptr;
+	uint64_t host = *src_ptr;
+	uint64_t remaining = count;
+	uint64_t chunk = 0;
+	int rc = 0;
+
+	AFU_MF_PMD_DEBUG("0x%"PRIx64" --> 0x%"PRIx64" (0x%"PRIx64")", host, dev,
+		count);
+
+	/* head: read-modify-write up to the next QWORD boundary */
+	if (!(IS_ALIGNED_DWORD(dev) || IS_ALIGNED_QWORD(dev))) {
+		chunk = QWORD_BYTES - (dev % QWORD_BYTES);
+		if (chunk > remaining)
+			chunk = remaining;
+		rc = ase_write_unaligned(ctx, dev, host, chunk);
+		if (rc != 0)
+			return rc;
+		remaining -= chunk;
+		host += chunk;
+		dev += chunk;
+	}
+
+	/* body: 8/4 byte MMIO transfers */
+	rc = ase_write(ctx, &dev, &host, &remaining);
+	if (rc != 0)
+		return rc;
+
+	/* tail: remaining bytes, again with read-modify-write */
+	chunk = QWORD_BYTES - (dev % QWORD_BYTES);
+	if (chunk > remaining)
+		chunk = remaining;
+	rc = ase_write_unaligned(ctx, dev, host, chunk);
+	if (rc != 0)
+		return rc;
+
+	*dst_ptr = dev + chunk;
+	*src_ptr = host + chunk;
+
+	return 0;
+}
+
+/*
+ * Read fewer than QWORD_BYTES bytes from the device: the 64-bit word
+ * containing dev_addr is read through the ASE data window and count bytes,
+ * starting at the byte offset of dev_addr inside that word, are copied to
+ * host memory.
+ *
+ * Returns 0 on success (also for count == 0) and -EINVAL when ctx is NULL or
+ * count >= QWORD_BYTES.
+ */
+static int ase_read_unaligned(struct dma_afu_ctx *ctx, uint64_t dev_addr,
+	uint64_t host_addr, uint32_t count)
+{
+	void *host = (void *)(uintptr_t)host_addr;
+	uint64_t byte_off;
+	uint64_t win_off;
+	uint64_t qword;
+
+	AFU_MF_PMD_DEBUG("0x%"PRIx64" <-- 0x%"PRIx64" (0x%x)", host_addr,
+		dev_addr, count);
+
+	if (ctx == NULL || count >= QWORD_BYTES)
+		return -EINVAL;
+	if (count == 0)
+		return 0;
+
+	switch_ase_page(ctx, dev_addr);
+
+	byte_off = dev_addr % QWORD_BYTES;
+	win_off = (dev_addr - byte_off) & DMA_ASE_WINDOW_MASK;
+
+	/* fetch the enclosing word and hand out the requested bytes */
+	qword = rte_read64(ctx->ase_data_addr + win_off);
+	rte_memcpy(host, (char *)&qword + byte_off, count);
+
+	return 0;
+}
+
+/*
+ * Copy device memory to the host through the ASE window using 32-bit and
+ * 64-bit MMIO reads.
+ *
+ * @ctx:     DMA context (must not be NULL)
+ * @src_ptr: in/out device address, must be DWORD aligned on entry
+ * @dst_ptr: in/out host address
+ * @count:   in/out number of bytes left to transfer
+ *
+ * At most one leading DWORD is read to reach QWORD alignment, then as many
+ * QWORDs as possible are read window by window, then at most one trailing
+ * DWORD. On return the three in/out values describe what is still left
+ * (fewer than DWORD_BYTES bytes when the request was large enough).
+ *
+ * Returns 0 on success, -EINVAL if ctx is NULL or src is not DWORD aligned.
+ */
+static int ase_read(struct dma_afu_ctx *ctx, uint64_t *src_ptr,
+	uint64_t *dst_ptr, uint64_t *count)
+{
+	uint64_t src = *src_ptr;
+	uint64_t dst = *dst_ptr;
+	uint64_t align_bytes = *count;
+	uint64_t offset = 0;
+	uint64_t left_in_page = 0;
+	uint64_t size_to_copy = 0;
+
+	AFU_MF_PMD_DEBUG("0x%"PRIx64" <-- 0x%"PRIx64" (0x%"PRIx64")", dst, src,
+		align_bytes);
+
+	if (!ctx || !IS_ALIGNED_DWORD(src))
+		return -EINVAL;
+
+	/* nothing this function can move; outputs already equal the inputs */
+	if (align_bytes < DWORD_BYTES)
+		return 0;
+
+	if (!IS_ALIGNED_QWORD(src)) {
+		/* Read a single DWORD to get QWORD aligned */
+		switch_ase_page(ctx, src);
+		offset = src & DMA_ASE_WINDOW_MASK;
+		*(uint32_t *)(uintptr_t)dst =
+			rte_read32(ctx->ase_data_addr + offset);
+		src += DWORD_BYTES;
+		dst += DWORD_BYTES;
+		align_bytes -= DWORD_BYTES;
+	}
+
+	/*
+	 * Read blocks of 64-bit values. The space left in the current window
+	 * is recomputed on every pass; carrying it over from the previous
+	 * pass would shrink it to zero and end the copy early when the
+	 * transfer starts inside a window and spans several of them.
+	 */
+	while (align_bytes >= QWORD_BYTES) {
+		offset = src & DMA_ASE_WINDOW_MASK;
+		left_in_page = DMA_ASE_WINDOW - offset;
+		size_to_copy =
+			MIN(left_in_page, (align_bytes & ~(QWORD_BYTES - 1)));
+		if (size_to_copy < QWORD_BYTES)
+			break;
+		switch_ase_page(ctx, src);
+		blk_read64((uint64_t *)(ctx->ase_data_addr + offset),
+			(uint64_t *)(uintptr_t)dst, size_to_copy);
+		src += size_to_copy;
+		dst += size_to_copy;
+		align_bytes -= size_to_copy;
+	}
+
+	if (align_bytes >= DWORD_BYTES) {
+		/* Read remaining DWORD */
+		switch_ase_page(ctx, src);
+		offset = src & DMA_ASE_WINDOW_MASK;
+		*(uint32_t *)(uintptr_t)dst =
+			rte_read32(ctx->ase_data_addr + offset);
+		src += DWORD_BYTES;
+		dst += DWORD_BYTES;
+		align_bytes -= DWORD_BYTES;
+	}
+
+	/* always report progress, including the "only one DWORD" case */
+	*src_ptr = src;
+	*dst_ptr = dst;
+	*count = align_bytes;
+
+	return 0;
+}
+
+/*
+ * Copy count bytes from the device to host memory through the ASE window.
+ *
+ * Unless the device address is already DWORD or QWORD aligned, the bytes up
+ * to the next QWORD boundary are fetched with ase_read_unaligned(). The bulk
+ * is then handled by ase_read() and whatever is left afterwards is fetched
+ * with a second ase_read_unaligned() call.
+ *
+ * On success *src_ptr and *dst_ptr are advanced past the transferred data
+ * and 0 is returned; otherwise the error of the failing helper is returned.
+ */
+static int ase_fpga_to_host(struct dma_afu_ctx *ctx, uint64_t *src_ptr,
+	uint64_t *dst_ptr, uint64_t count)
+{
+	uint64_t dev = *src_ptr;
+	uint64_t host = *dst_ptr;
+	uint64_t remaining = count;
+	uint64_t chunk = 0;
+	int rc = 0;
+
+	AFU_MF_PMD_DEBUG("0x%"PRIx64" --> 0x%"PRIx64" (0x%"PRIx64")", dev, host,
+		count);
+
+	/* head: partial word up to the next QWORD boundary */
+	if (!(IS_ALIGNED_DWORD(dev) || IS_ALIGNED_QWORD(dev))) {
+		chunk = QWORD_BYTES - (dev % QWORD_BYTES);
+		if (chunk > remaining)
+			chunk = remaining;
+		rc = ase_read_unaligned(ctx, dev, host, chunk);
+		if (rc != 0)
+			return rc;
+		remaining -= chunk;
+		host += chunk;
+		dev += chunk;
+	}
+
+	/* body: 8/4 byte MMIO transfers */
+	rc = ase_read(ctx, &dev, &host, &remaining);
+	if (rc != 0)
+		return rc;
+
+	/* tail: remaining bytes from a partial word */
+	chunk = QWORD_BYTES - (dev % QWORD_BYTES);
+	if (chunk > remaining)
+		chunk = remaining;
+	rc = ase_read_unaligned(ctx, dev, host, chunk);
+	if (rc != 0)
+		return rc;
+
+	*dst_ptr = host + chunk;
+	*src_ptr = dev + chunk;
+
+	return 0;
+}
+
+/*
+ * Acknowledge the DMA interrupt by writing a status word that has only the
+ * IRQ bit set to the status CSR. A NULL ctx is ignored.
+ */
+static void clear_interrupt(struct dma_afu_ctx *ctx)
+{
+	msgdma_status ack;
+
+	if (ctx == NULL)
+		return;
+
+	ack.csr = 0;
+	ack.irq = 1;
+	rte_write32(ack.csr, CSR_STATUS(ctx->csr_addr));
+}
+
+/*
+ * Wait up to DMA_TIMEOUT_MSEC for the DMA completion event and acknowledge
+ * the interrupt.
+ *
+ * Returns 0 when the eventfd counter was read, -EINVAL on a NULL context or
+ * invalid event fd, -EFAULT when poll() fails, -ETIMEDOUT when no event
+ * arrives in time and -EIO when the eventfd cannot be read.
+ * Once the arguments are valid, the interrupt status bit is cleared on every
+ * path, successful or not.
+ */
+static int poll_interrupt(struct dma_afu_ctx *ctx)
+{
+	struct pollfd pfd = {0};
+	uint64_t count = 0;
+	ssize_t bytes_read = 0;
+	int poll_ret = 0;
+	int ret = 0;
+
+	if (!ctx || (ctx->event_fd < 0))
+		return -EINVAL;
+
+	pfd.fd = ctx->event_fd;
+	pfd.events = POLLIN;
+	poll_ret = poll(&pfd, 1, DMA_TIMEOUT_MSEC);
+	if (poll_ret < 0) {
+		AFU_MF_PMD_ERR("Error %s", strerror(errno));
+		ret = -EFAULT;
+	} else if (poll_ret == 0) {
+		AFU_MF_PMD_ERR("Timeout");
+		ret = -ETIMEDOUT;
+	} else {
+		bytes_read = read(pfd.fd, &count, sizeof(count));
+		if (bytes_read > 0) {
+			if (ctx->verbose)
+				AFU_MF_PMD_DEBUG("Successful, ret %d, cnt %"PRIu64,
+					poll_ret, count);
+			ret = 0;
+		} else {
+			/*
+			 * errno is only meaningful when read() failed
+			 * (negative return); a zero return is reported as such.
+			 */
+			AFU_MF_PMD_ERR("Failed %s", bytes_read < 0 ?
+				strerror(errno) : "zero bytes read");
+			ret = -EIO;
+		}
+	}
+
+	clear_interrupt(ctx);
+	return ret;
+}
+
+/*
+ * Hand one extended descriptor to the device.
+ *
+ * When ctx->verbose is set the descriptor fields are dumped first. The
+ * status CSR is then polled until desc_buf_full is clear, and the
+ * descriptor is copied to ctx->desc_addr with blk_write64().
+ * A NULL ctx is silently ignored; desc must not be NULL.
+ */
+static void send_descriptor(struct dma_afu_ctx *ctx, msgdma_ext_desc *desc)
+{
+	msgdma_status status;
+	uint64_t fpga_queue_full = 0;
+
+	if (!ctx)
+		return;
+
+	if (ctx->verbose) {
+		AFU_MF_PMD_DEBUG("descriptor.rd_address = 0x%x%08x",
+			desc->rd_address_ext, desc->rd_address);
+		AFU_MF_PMD_DEBUG("descriptor.wr_address = 0x%x%08x",
+			desc->wr_address_ext, desc->wr_address);
+		AFU_MF_PMD_DEBUG("descriptor.len = %u", desc->len);
+		AFU_MF_PMD_DEBUG("descriptor.wr_burst_count = %u",
+			desc->wr_burst_count);
+		AFU_MF_PMD_DEBUG("descriptor.rd_burst_count = %u",
+			desc->rd_burst_count);
+		AFU_MF_PMD_DEBUG("descriptor.wr_stride %u", desc->wr_stride);
+		AFU_MF_PMD_DEBUG("descriptor.rd_stride %u", desc->rd_stride);
+	}
+
+	/*
+	 * Wait for room in the descriptor buffer. There is no timeout here:
+	 * the counter only limits the debug message to one per ~1e8 polls.
+	 */
+	do {
+		status.csr = rte_read32(CSR_STATUS(ctx->csr_addr));
+		if (fpga_queue_full++ > 100000000) {
+			AFU_MF_PMD_DEBUG("DMA queue full retry");
+			fpga_queue_full = 0;
+		}
+	} while (status.desc_buf_full);
+
+	/* the descriptor is written as a sequence of 64-bit words */
+	blk_write64((uint64_t *)ctx->desc_addr, (uint64_t *)desc,
+		sizeof(*desc));
+}
+
+/*
+ * Build and post the descriptor(s) for one DMA transfer.
+ *
+ * @ctx:          DMA context; ctx->desc_buf is used as scratch descriptor
+ * @dst:          destination address, must satisfy IS_DMA_ALIGNED()
+ * @src:          source address, must satisfy IS_DMA_ALIGNED()
+ * @count:        number of bytes, must satisfy IS_DMA_ALIGNED()
+ * @is_last_desc: when zero, early_done_en is set in the descriptor
+ * @type:         HOST_TO_FPGA, FPGA_TO_HOST or FPGA_TO_FPGA
+ * @intr_en:      non-zero to request an interrupt for the transfer
+ *
+ * FPGA_TO_FPGA transfers are posted as a single descriptor with burst
+ * count 4. For the other types the host-side address is brought to a
+ * CCIP_ALIGN_BYTES boundary first, so up to three descriptors are posted:
+ * a short head (burst 1), an aligned body (burst 4) and a short tail
+ * (burst 1). When the transfer is split, only the final descriptor carries
+ * the interrupt request.
+ *
+ * Returns 0 on success, -EINVAL on a NULL context or misaligned arguments.
+ */
+static int do_dma(struct dma_afu_ctx *ctx, uint64_t dst, uint64_t src,
+	int count, int is_last_desc, fpga_dma_type type, int intr_en)
+{
+	msgdma_ext_desc *desc = NULL;
+	int alignment_offset = 0;
+	int segment_size = 0;
+
+	if (!ctx)
+		return -EINVAL;
+
+	/* src, dst and count must be 64-byte aligned */
+	if (!IS_DMA_ALIGNED(src) || !IS_DMA_ALIGNED(dst) ||
+		!IS_DMA_ALIGNED(count))
+		return -EINVAL;
+	memset(ctx->desc_buf, 0, sizeof(msgdma_ext_desc));
+
+	/* these fields are fixed for all DMA transfers */
+	desc = ctx->desc_buf;
+	desc->seq_num = 0;
+	desc->wr_stride = 1;
+	desc->rd_stride = 1;
+	desc->control.go = 1;
+	desc->control.transfer_irq_en = intr_en ? 1 : 0;
+	desc->control.early_done_en = is_last_desc ? 0 : 1;
+
+	if (type == FPGA_TO_FPGA) {
+		desc->rd_address = src & DMA_MASK_32_BIT;
+		desc->wr_address = dst & DMA_MASK_32_BIT;
+		desc->len = count;
+		desc->wr_burst_count = 4;
+		desc->rd_burst_count = 4;
+		desc->rd_address_ext = (src >> 32) & DMA_MASK_32_BIT;
+		desc->wr_address_ext = (dst >> 32) & DMA_MASK_32_BIT;
+		send_descriptor(ctx, desc);
+	} else {
+		/* check CCIP (host) address is aligned to 4CL (256B) */
+		alignment_offset = (type == HOST_TO_FPGA)
+			? (src % CCIP_ALIGN_BYTES) : (dst % CCIP_ALIGN_BYTES);
+		/* performing a short transfer to get aligned */
+		if (alignment_offset != 0) {
+			desc->rd_address = src & DMA_MASK_32_BIT;
+			desc->wr_address = dst & DMA_MASK_32_BIT;
+			desc->wr_burst_count = 1;
+			desc->rd_burst_count = 1;
+			desc->rd_address_ext = (src >> 32) & DMA_MASK_32_BIT;
+			desc->wr_address_ext = (dst >> 32) & DMA_MASK_32_BIT;
+			/* count isn't large enough to hit next 4CL boundary */
+			if ((CCIP_ALIGN_BYTES - alignment_offset) >= count) {
+				segment_size = count;
+				count = 0;
+			} else {
+				segment_size = CCIP_ALIGN_BYTES
+					- alignment_offset;
+				src += segment_size;
+				dst += segment_size;
+				count -= segment_size;
+				/* more descriptors follow: no IRQ yet */
+				desc->control.transfer_irq_en = 0;
+			}
+			/* post short transfer to align to a 4CL (256 byte) */
+			desc->len = segment_size;
+			send_descriptor(ctx, desc);
+		}
+		/* at this point we are 4CL (256 byte) aligned */
+		if (count >= CCIP_ALIGN_BYTES) {
+			desc->rd_address = src & DMA_MASK_32_BIT;
+			desc->wr_address = dst & DMA_MASK_32_BIT;
+			desc->wr_burst_count = 4;
+			desc->rd_burst_count = 4;
+			desc->rd_address_ext = (src >> 32) & DMA_MASK_32_BIT;
+			desc->wr_address_ext = (dst >> 32) & DMA_MASK_32_BIT;
+			/* buffer ends on 4CL boundary */
+			if ((count % CCIP_ALIGN_BYTES) == 0) {
+				segment_size = count;
+				count = 0;
+				/*
+				 * This is the final descriptor. A preceding
+				 * short head transfer may have cleared the
+				 * IRQ request, so restore it here; otherwise
+				 * the completion interrupt is never raised.
+				 */
+				desc->control.transfer_irq_en = intr_en ? 1 : 0;
+			} else {
+				segment_size = count
+					- (count % CCIP_ALIGN_BYTES);
+				src += segment_size;
+				dst += segment_size;
+				count -= segment_size;
+				/* a tail descriptor follows: no IRQ yet */
+				desc->control.transfer_irq_en = 0;
+			}
+			desc->len = segment_size;
+			send_descriptor(ctx, desc);
+		}
+		/* post short transfer to handle the remainder */
+		if (count > 0) {
+			desc->rd_address = src & DMA_MASK_32_BIT;
+			desc->wr_address = dst & DMA_MASK_32_BIT;
+			desc->len = count;
+			desc->wr_burst_count = 1;
+			desc->rd_burst_count = 1;
+			desc->rd_address_ext = (src >> 32) & DMA_MASK_32_BIT;
+			desc->wr_address_ext = (dst >> 32) & DMA_MASK_32_BIT;
+			if (intr_en)
+				desc->control.transfer_irq_en = 1;
+			send_descriptor(ctx, desc);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Clear the host-side magic word and queue a 64-byte FPGA_TO_HOST DMA from
+ * DMA_WF_MAGIC_ROM into the magic buffer, with the interrupt enabled.
+ *
+ * Returns the result of do_dma(), or -EINVAL when the context or its magic
+ * buffer is missing.
+ */
+static int issue_magic(struct dma_afu_ctx *ctx)
+{
+	/* validate before touching magic_buf, like the other helpers do */
+	if (!ctx || !ctx->magic_buf)
+		return -EINVAL;
+
+	*(ctx->magic_buf) = 0ULL;
+	return do_dma(ctx, DMA_WF_HOST_ADDR(ctx->magic_iova),
+		DMA_WF_MAGIC_ROM, 64, 1, FPGA_TO_HOST, 1);
+}
+
+/*
+ * Wait for the magic transfer queued by issue_magic() to land in host
+ * memory: wait for the interrupt, then spin a bounded number of times until
+ * the magic word equals DMA_WF_MAGIC. The magic word is cleared before
+ * returning. A timeout is logged but not reported to the caller.
+ */
+static void wait_magic(struct dma_afu_ctx *ctx)
+{
+	/*
+	 * The buffer is written by the device behind the compiler's back, so
+	 * it is accessed through a volatile pointer; otherwise the load may
+	 * be hoisted out of the loop and the spin never sees the update.
+	 */
+	volatile uint64_t *magic = NULL;
+	int magic_timeout = 0;
+
+	if (!ctx || !ctx->magic_buf)
+		return;
+
+	magic = ctx->magic_buf;
+
+	poll_interrupt(ctx);
+	while (*magic != DMA_WF_MAGIC) {
+		if (magic_timeout++ > 1000) {
+			AFU_MF_PMD_ERR("DMA magic operation timeout");
+			break;
+		}
+	}
+	*magic = 0ULL;
+}
+
+/*
+ * Transfer one dma_buf_size chunk from host memory to the FPGA.
+ *
+ * @dst/@src:      base addresses of the whole transfer; the chunk offset
+ *                 (chunk * dma_buf_size) is added here
+ * @chunk:         index of the chunk within the transfer
+ * @is_last_chunk: non-zero for the final chunk
+ * @intr_issued:   in/out flag, set once a descriptor with interrupt enabled
+ *                 has been posted
+ *
+ * The data is staged in ctx->dma_buf[chunk % NUM_DMA_BUF] and then moved
+ * with do_dma(). Returns 0 or a negative errno value.
+ */
+static int dma_tx_buf(struct dma_afu_ctx *ctx, uint64_t dst, uint64_t src,
+	uint64_t chunk, int is_last_chunk, int *intr_issued)
+{
+	int intr_en = 0;
+	int ret = 0;
+
+	if (!ctx || !intr_issued)
+		return -EINVAL;
+
+	src += chunk * ctx->dma_buf_size;
+	dst += chunk * ctx->dma_buf_size;
+
+	/*
+	 * Every HALF_DMA_BUF-th chunk and the last chunk request an
+	 * interrupt. Before doing so, wait for the interrupt of the
+	 * previously flagged chunk, if there was one.
+	 */
+	if (((chunk % HALF_DMA_BUF) == (HALF_DMA_BUF - 1)) || is_last_chunk) {
+		if (*intr_issued) {
+			ret = poll_interrupt(ctx);
+			if (ret)
+				return ret;
+		}
+		intr_en = 1;
+	}
+
+	/* from here on chunk is the index of the staging buffer */
+	chunk %= NUM_DMA_BUF;
+	rte_memcpy(ctx->dma_buf[chunk], (void *)(uintptr_t)src,
+		ctx->dma_buf_size);
+	ret = do_dma(ctx, dst, DMA_HOST_ADDR(ctx->dma_iova[chunk]),
+			ctx->dma_buf_size, 0, HOST_TO_FPGA, intr_en);
+	/* the flag is set even when do_dma() reported an error */
+	if (intr_en)
+		*intr_issued = 1;
+
+	return ret;
+}
+
+/*
+ * Synchronous copy of count bytes from host address src to FPGA address dst.
+ *
+ * The transfer is split into up to four phases:
+ *  1. if dst is not DMA aligned, the bytes up to the next DMA_ALIGN_BYTES
+ *     boundary go through the ASE window (the whole request does when it is
+ *     shorter than DMA_ALIGN_BYTES);
+ *  2. whole dma_buf_size chunks are sent with dma_tx_buf();
+ *  3. the remaining multiple of DMA_ALIGN_BYTES is sent with one do_dma()
+ *     through dma_buf[0];
+ *  4. the final bytes go through the ASE window.
+ *
+ * Returns 0 on success or the negative errno of the failing step.
+ */
+static int dma_host_to_fpga(struct dma_afu_ctx *ctx, uint64_t dst, uint64_t src,
+	size_t count)
+{
+	uint64_t i = 0;
+	uint64_t count_left = count;
+	uint64_t aligned_addr = 0;
+	uint64_t align_bytes = 0;
+	uint64_t dma_chunks = 0;
+	uint64_t dma_tx_bytes = 0;
+	uint64_t offset = 0;
+	int issued_intr = 0;
+	int ret = 0;
+
+	AFU_MF_PMD_DEBUG("0x%"PRIx64" ---> 0x%"PRIx64" (%zu)", src, dst,
+		count);
+
+	if (!ctx)
+		return -EINVAL;
+
+	/* phase 1: ase_host_to_fpga() advances dst and src itself */
+	if (!IS_DMA_ALIGNED(dst)) {
+		if (count_left < DMA_ALIGN_BYTES)
+			return ase_host_to_fpga(ctx, &dst, &src, count_left);
+
+		aligned_addr = ((dst / DMA_ALIGN_BYTES) + 1)
+			* DMA_ALIGN_BYTES;
+		align_bytes = aligned_addr - dst;
+		ret = ase_host_to_fpga(ctx, &dst, &src, align_bytes);
+		if (ret)
+			return ret;
+		count_left = count_left - align_bytes;
+	}
+
+	if (count_left) {
+		/* phase 2: offset is the number of bytes sent as full chunks */
+		dma_chunks = count_left / ctx->dma_buf_size;
+		offset = dma_chunks * ctx->dma_buf_size;
+		count_left -= offset;
+		AFU_MF_PMD_DEBUG("0x%"PRIx64" ---> 0x%"PRIx64
+			" (%"PRIu64"...0x%"PRIx64")",
+			src, dst, dma_chunks, count_left);
+		for (i = 0; i < dma_chunks; i++) {
+			ret = dma_tx_buf(ctx, dst, src, i,
+				i == (dma_chunks - 1), &issued_intr);
+			if (ret)
+				return ret;
+		}
+
+		/* wait for the interrupt of the last flagged chunk */
+		if (issued_intr) {
+			ret = poll_interrupt(ctx);
+			if (ret)
+				return ret;
+		}
+
+		if (count_left) {
+			/* phase 3: DMA-aligned part of the remainder */
+			i = count_left / DMA_ALIGN_BYTES;
+			if (i > 0) {
+				dma_tx_bytes = i * DMA_ALIGN_BYTES;
+				AFU_MF_PMD_DEBUG("left over 0x%"PRIx64" to DMA",
+					dma_tx_bytes);
+				rte_memcpy(ctx->dma_buf[0],
+					(void *)(uintptr_t)(src + offset),
+					dma_tx_bytes);
+				ret = do_dma(ctx, dst + offset,
+					DMA_HOST_ADDR(ctx->dma_iova[0]),
+					dma_tx_bytes, 1, HOST_TO_FPGA, 1);
+				if (ret)
+					return ret;
+				ret = poll_interrupt(ctx);
+				if (ret)
+					return ret;
+			}
+
+			/* phase 4: bytes too few for a DMA transfer */
+			count_left -= dma_tx_bytes;
+			if (count_left) {
+				AFU_MF_PMD_DEBUG("left over 0x%"PRIx64" to ASE",
+					count_left);
+				dst += offset + dma_tx_bytes;
+				src += offset + dma_tx_bytes;
+				ret = ase_host_to_fpga(ctx, &dst, &src,
+					count_left);
+			}
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Queue the FPGA-to-host DMA for one dma_buf_size chunk and, when enough
+ * chunks are outstanding, drain completed staging buffers to dst.
+ *
+ * @dst/@src:      base addresses of the whole transfer
+ * @chunk:         index of the chunk to queue; it lands in
+ *                 ctx->dma_buf[chunk % NUM_DMA_BUF]
+ * @is_last_chunk: non-zero for the final chunk
+ * @rx_count:      in/out number of chunks already copied to dst
+ * @wf_issued:     in/out flag, non-zero while a magic transfer is pending
+ *
+ * Note that *rx_count is read before the argument checks and rx_count
+ * itself is not checked for NULL.
+ *
+ * Returns 0 or a negative errno value.
+ */
+static int dma_rx_buf(struct dma_afu_ctx *ctx, uint64_t dst, uint64_t src,
+	uint64_t chunk, int is_last_chunk, uint64_t *rx_count, int *wf_issued)
+{
+	uint64_t i = chunk % NUM_DMA_BUF;
+	uint64_t n = *rx_count;
+	uint64_t num_pending = 0;
+	int ret = 0;
+
+	if (!ctx || !wf_issued)
+		return -EINVAL;
+
+	ret = do_dma(ctx, DMA_HOST_ADDR(ctx->dma_iova[i]),
+		src + chunk * ctx->dma_buf_size,
+		ctx->dma_buf_size, 1, FPGA_TO_HOST, 0);
+	if (ret)
+		return ret;
+
+	/* chunks queued so far that have not been copied to dst yet */
+	num_pending = chunk - n + 1;
+	if (num_pending == HALF_DMA_BUF) {
+		ret = issue_magic(ctx);
+		if (ret) {
+			AFU_MF_PMD_DEBUG("Magic issue failed");
+			return ret;
+		}
+		*wf_issued = 1;
+	}
+
+	if ((num_pending > (NUM_DMA_BUF - 1)) || is_last_chunk) {
+		if (*wf_issued) {
+			/*
+			 * Once the pending magic transfer has arrived, copy
+			 * HALF_DMA_BUF staging buffers out to dst.
+			 */
+			wait_magic(ctx);
+			for (i = 0; i < HALF_DMA_BUF; i++) {
+				rte_memcpy((void *)(uintptr_t)(dst +
+						n * ctx->dma_buf_size),
+					ctx->dma_buf[n % NUM_DMA_BUF],
+					ctx->dma_buf_size);
+				n++;
+			}
+			*wf_issued = 0;
+			*rx_count = n;
+		}
+		/* queue a new magic transfer behind the chunks still pending */
+		ret = issue_magic(ctx);
+		if (ret) {
+			AFU_MF_PMD_DEBUG("Magic issue failed");
+			return ret;
+		}
+		*wf_issued = 1;
+	}
+
+	return ret;
+}
+
+/*
+ * Synchronous copy of count bytes from FPGA address src to host address dst.
+ *
+ * The transfer is split into up to four phases:
+ *  1. if src is not DMA aligned, the bytes up to the next DMA_ALIGN_BYTES
+ *     boundary are read through the ASE window (the whole request is when
+ *     it is shorter than DMA_ALIGN_BYTES);
+ *  2. whole dma_buf_size chunks are received with dma_rx_buf() and copied
+ *     from the staging buffers to dst;
+ *  3. the remaining multiple of DMA_ALIGN_BYTES is received with one
+ *     do_dma() into dma_buf[0] followed by a magic transfer;
+ *  4. the final bytes are read through the ASE window.
+ *
+ * Returns 0 on success or the negative errno of the failing step.
+ */
+static int dma_fpga_to_host(struct dma_afu_ctx *ctx, uint64_t dst, uint64_t src,
+	size_t count)
+{
+	uint64_t i = 0;
+	uint64_t count_left = count;
+	uint64_t aligned_addr = 0;
+	uint64_t align_bytes = 0;
+	uint64_t dma_chunks = 0;
+	uint64_t pending_buf = 0;
+	uint64_t dma_rx_bytes = 0;
+	uint64_t offset = 0;
+	int wf_issued = 0;
+	int ret = 0;
+
+	AFU_MF_PMD_DEBUG("0x%"PRIx64" ---> 0x%"PRIx64" (%zu)", src, dst,
+		count);
+
+	if (!ctx)
+		return -EINVAL;
+
+	/* phase 1: ase_fpga_to_host() advances src and dst itself */
+	if (!IS_DMA_ALIGNED(src)) {
+		if (count_left < DMA_ALIGN_BYTES)
+			return ase_fpga_to_host(ctx, &src, &dst, count_left);
+
+		aligned_addr = ((src / DMA_ALIGN_BYTES) + 1)
+			 * DMA_ALIGN_BYTES;
+		align_bytes = aligned_addr - src;
+		ret = ase_fpga_to_host(ctx, &src, &dst, align_bytes);
+		if (ret)
+			return ret;
+		count_left = count_left - align_bytes;
+	}
+
+	if (count_left) {
+		/* phase 2: offset is the number of bytes moved as full chunks */
+		dma_chunks = count_left / ctx->dma_buf_size;
+		offset = dma_chunks * ctx->dma_buf_size;
+		count_left -= offset;
+		AFU_MF_PMD_DEBUG("0x%"PRIx64" ---> 0x%"PRIx64
+			" (%"PRIu64"...0x%"PRIx64")",
+			src, dst, dma_chunks, count_left);
+		/* pending_buf counts the chunks already copied to dst */
+		for (i = 0; i < dma_chunks; i++) {
+			ret = dma_rx_buf(ctx, dst, src, i,
+				i == (dma_chunks - 1),
+				&pending_buf, &wf_issued);
+			if (ret)
+				return ret;
+		}
+
+		if (wf_issued)
+			wait_magic(ctx);
+
+		/* clear out final dma memcpy operations */
+		while (pending_buf < dma_chunks) {
+			/* constant size transfer; no length check required */
+			rte_memcpy((void *)(uintptr_t)(dst +
+					pending_buf * ctx->dma_buf_size),
+				ctx->dma_buf[pending_buf % NUM_DMA_BUF],
+				ctx->dma_buf_size);
+			pending_buf++;
+		}
+
+		if (count_left > 0) {
+			/* phase 3: DMA-aligned part of the remainder */
+			i = count_left / DMA_ALIGN_BYTES;
+			if (i > 0) {
+				dma_rx_bytes = i * DMA_ALIGN_BYTES;
+				AFU_MF_PMD_DEBUG("left over 0x%"PRIx64" to DMA",
+					dma_rx_bytes);
+				ret = do_dma(ctx,
+					DMA_HOST_ADDR(ctx->dma_iova[0]),
+					src + offset,
+					dma_rx_bytes, 1, FPGA_TO_HOST, 0);
+				if (ret)
+					return ret;
+				ret = issue_magic(ctx);
+				if (ret)
+					return ret;
+				wait_magic(ctx);
+				rte_memcpy((void *)(uintptr_t)(dst + offset),
+					ctx->dma_buf[0], dma_rx_bytes);
+			}
+
+			/* phase 4: bytes too few for a DMA transfer */
+			count_left -= dma_rx_bytes;
+			if (count_left) {
+				AFU_MF_PMD_DEBUG("left over 0x%"PRIx64" to ASE",
+					count_left);
+				dst += offset + dma_rx_bytes;
+				src += offset + dma_rx_bytes;
+				ret = ase_fpga_to_host(ctx, &src, &dst,
+							count_left);
+			}
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Synchronous copy of count bytes between two FPGA addresses.
+ *
+ * When dst, src and count are all DMA aligned the data is moved directly on
+ * the device: one FPGA_TO_FPGA descriptor per dma_buf_size piece, with a
+ * magic transfer after every NUM_DMA_BUF descriptors and after the last
+ * one, followed by one descriptor for the remainder.
+ *
+ * Otherwise the data is bounced through a temporary host buffer of
+ * dma_buf_size bytes using dma_fpga_to_host() and dma_host_to_fpga(). This
+ * path rejects requests where dst lies inside [src, src + count).
+ *
+ * Returns 0 on success, -EINVAL on a NULL context or overlapping ranges,
+ * -ENOMEM when the bounce buffer cannot be allocated, or the error of the
+ * failing transfer.
+ */
+static int dma_fpga_to_fpga(struct dma_afu_ctx *ctx, uint64_t dst, uint64_t src,
+	size_t count)
+{
+	uint64_t i = 0;
+	uint64_t count_left = count;
+	uint64_t dma_chunks = 0;
+	uint64_t offset = 0;
+	uint32_t tx_chunks = 0;
+	uint64_t *tmp_buf = NULL;
+	int ret = 0;
+
+	AFU_MF_PMD_DEBUG("0x%"PRIx64" ---> 0x%"PRIx64" (%zu)", src, dst,
+		count);
+
+	if (!ctx)
+		return -EINVAL;
+
+	if (IS_DMA_ALIGNED(dst) && IS_DMA_ALIGNED(src)
+	    && IS_DMA_ALIGNED(count_left)) {
+		dma_chunks = count_left / ctx->dma_buf_size;
+		offset = dma_chunks * ctx->dma_buf_size;
+		count_left -= offset;
+		AFU_MF_PMD_DEBUG("0x%"PRIx64" ---> 0x%"PRIx64
+			" (%"PRIu64"...0x%"PRIx64")",
+			src, dst, dma_chunks, count_left);
+		for (i = 0; i < dma_chunks; i++) {
+			ret = do_dma(ctx, dst + i * ctx->dma_buf_size,
+				src + i * ctx->dma_buf_size,
+				ctx->dma_buf_size, 0, FPGA_TO_FPGA, 0);
+			if (ret)
+				return ret;
+			if ((((i + 1) % NUM_DMA_BUF) == 0) ||
+				(i == (dma_chunks - 1))) {
+				ret = issue_magic(ctx);
+				if (ret)
+					return ret;
+				wait_magic(ctx);
+			}
+		}
+
+		if (count_left > 0) {
+			AFU_MF_PMD_DEBUG("left over 0x%"PRIx64" to DMA", count_left);
+			ret = do_dma(ctx, dst + offset, src + offset,
+				count_left, 1, FPGA_TO_FPGA, 0);
+			if (ret)
+				return ret;
+			ret = issue_magic(ctx);
+			if (ret)
+				return ret;
+			wait_magic(ctx);
+		}
+	} else {
+		if ((src < dst) && (src + count_left > dst)) {
+			AFU_MF_PMD_ERR("Overlapping: 0x%"PRIx64
+				" -> 0x%"PRIx64" (0x%"PRIx64")",
+				src, dst, count_left);
+			return -EINVAL;
+		}
+		tx_chunks = count_left / ctx->dma_buf_size;
+		offset = tx_chunks * ctx->dma_buf_size;
+		count_left -= offset;
+		AFU_MF_PMD_DEBUG("0x%"PRIx64" --> 0x%"PRIx64
+			" (%u...0x%"PRIx64")",
+			src, dst, tx_chunks, count_left);
+		tmp_buf = (uint64_t *)rte_malloc(NULL, ctx->dma_buf_size,
+			DMA_ALIGN_BYTES);
+		/* without a bounce buffer the copies below would target NULL */
+		if (!tmp_buf)
+			return -ENOMEM;
+
+		for (i = 0; i < tx_chunks; i++) {
+			ret = dma_fpga_to_host(ctx, (uint64_t)tmp_buf,
+				src + i * ctx->dma_buf_size,
+				ctx->dma_buf_size);
+			if (ret)
+				goto free_buf;
+			ret = dma_host_to_fpga(ctx,
+				dst + i * ctx->dma_buf_size,
+				(uint64_t)tmp_buf, ctx->dma_buf_size);
+			if (ret)
+				goto free_buf;
+		}
+
+		if (count_left > 0) {
+			ret = dma_fpga_to_host(ctx, (uint64_t)tmp_buf,
+				src + offset, count_left);
+			if (ret)
+				goto free_buf;
+			ret = dma_host_to_fpga(ctx, dst + offset,
+				(uint64_t)tmp_buf, count_left);
+			if (ret)
+				goto free_buf;
+		}
+free_buf:
+		rte_free(tmp_buf);
+	}
+
+	return ret;
+}
+
+/*
+ * Dispatch a synchronous transfer of count bytes to the routine matching
+ * the transfer type.
+ *
+ * Returns the result of the selected routine, or -EINVAL when ctx is NULL
+ * or the type is not one of HOST_TO_FPGA, FPGA_TO_HOST and FPGA_TO_FPGA.
+ */
+static int dma_transfer_sync(struct dma_afu_ctx *ctx, uint64_t dst,
+	uint64_t src, size_t count, fpga_dma_type type)
+{
+	if (ctx == NULL)
+		return -EINVAL;
+
+	switch (type) {
+	case HOST_TO_FPGA:
+		return dma_host_to_fpga(ctx, dst, src, count);
+	case FPGA_TO_HOST:
+		return dma_fpga_to_host(ctx, dst, src, count);
+	case FPGA_TO_FPGA:
+		return dma_fpga_to_fpga(ctx, dst, src, count);
+	default:
+		return -EINVAL;
+	}
+}
+
+/*
+ * Return the interval from start to end in seconds.
+ *
+ * The difference is computed in signed 64-bit nanoseconds so that it does
+ * not overflow where long is 32 bits wide, and so that an end time earlier
+ * than start yields a negative value instead of a huge positive one.
+ */
+static double getTime(struct timespec start, struct timespec end)
+{
+	int64_t sec = (int64_t)end.tv_sec - (int64_t)start.tv_sec;
+	int64_t nsec = (int64_t)end.tv_nsec - (int64_t)start.tv_nsec;
+	int64_t diff = sec * INT64_C(1000000000) + nsec;
+
+	return (double)diff / 1e9;
+}
+
+/* number of timed repetitions per direction in sweep_test() */
+#define SWEEP_ITERS 1
+/*
+ * Timed host -> FPGA -> host round trip with optional misalignment.
+ *
+ * @length:         nominal test length in bytes
+ * @ddr_offset:     FPGA address used for the test
+ * @buf_offset:     byte offset into ctx->data_buf at which the host buffer
+ *                  starts
+ * @size_decrement: number of bytes the transfer is shortened by
+ *
+ * The transferred size is length - (buf_offset + size_decrement). The
+ * measured bandwidth of both directions is printed and the result is
+ * checked with dma_afu_buf_verify().
+ *
+ * Returns 0 on success, -EINVAL on missing buffers or parameters that do
+ * not fit, or the error of the failing transfer/verification.
+ */
+static int sweep_test(struct dma_afu_ctx *ctx, uint32_t length,
+	uint64_t ddr_offset, uint64_t buf_offset, uint64_t size_decrement)
+{
+	struct timespec start, end;
+	uint64_t test_size = 0;
+	uint64_t *dma_buf_ptr = NULL;
+	double throughput, total_time = 0.0;
+	int i = 0;
+	int ret = 0;
+
+	if (!ctx || !ctx->data_buf || !ctx->ref_buf) {
+		AFU_MF_PMD_ERR("Buffer for DMA test is not allocated");
+		return -EINVAL;
+	}
+
+	if (length < (buf_offset + size_decrement)) {
+		AFU_MF_PMD_ERR("Test length does not match unaligned parameter");
+		return -EINVAL;
+	}
+	test_size = length - (buf_offset + size_decrement);
+	if ((ddr_offset + test_size) > ctx->mem_size) {
+		AFU_MF_PMD_ERR("Test is out of DDR memory space");
+		return -EINVAL;
+	}
+
+	dma_buf_ptr = (uint64_t *)((uint8_t *)ctx->data_buf + buf_offset);
+	printf("Sweep Host %p to FPGA 0x%"PRIx64
+		" with 0x%"PRIx64" bytes ...\n",
+		(void *)dma_buf_ptr, ddr_offset, test_size);
+
+	/* only the transfer itself is inside the timed section */
+	for (i = 0; i < SWEEP_ITERS; i++) {
+		clock_gettime(CLOCK_MONOTONIC, &start);
+		ret = dma_transfer_sync(ctx, ddr_offset, (uint64_t)dma_buf_ptr,
+			test_size, HOST_TO_FPGA);
+		clock_gettime(CLOCK_MONOTONIC, &end);
+		if (ret) {
+			AFU_MF_PMD_ERR("Failed");
+			return ret;
+		}
+		total_time += getTime(start, end);
+	}
+	/* bytes per microsecond, printed as MB/s */
+	throughput = (test_size * SWEEP_ITERS) / (total_time * 1000000);
+	printf("Measured bandwidth = %lf MB/s\n", throughput);
+
+	printf("Sweep FPGA 0x%"PRIx64" to Host %p with 0x%"PRIx64" bytes ...\n",
+		ddr_offset, (void *)dma_buf_ptr, test_size);
+
+	total_time = 0.0;
+	/* wipe the host buffer so that verification checks the read-back data */
+	memset((char *)dma_buf_ptr, 0, test_size);
+	for (i = 0; i < SWEEP_ITERS; i++) {
+		clock_gettime(CLOCK_MONOTONIC, &start);
+		ret = dma_transfer_sync(ctx, (uint64_t)dma_buf_ptr, ddr_offset,
+			test_size, FPGA_TO_HOST);
+		clock_gettime(CLOCK_MONOTONIC, &end);
+		if (ret) {
+			AFU_MF_PMD_ERR("Failed");
+			return ret;
+		}
+		total_time += getTime(start, end);
+	}
+	throughput = (test_size * SWEEP_ITERS) / (total_time * 1000000);
+	printf("Measured bandwidth = %lf MB/s\n", throughput);
+
+	printf("Verifying buffer ...\n");
+	return dma_afu_buf_verify(ctx, test_size);
+}
+
+/*
+ * Run the DMA self test described by priv->dma_cfg on the selected DMA
+ * context.
+ *
+ * Sequence: allocate and initialise the test buffers, enable the DMA
+ * interrupt, copy host -> FPGA -> host and verify, copy FPGA -> FPGA to a
+ * second region (when one fits) and read it back for verification, then
+ * run the aligned sweep and, if cfg->unaligned is set, a set of unaligned
+ * sweeps. The interrupt is disabled and the buffers are freed on exit.
+ *
+ * Returns 0 on success, -EINVAL on a NULL device or an out-of-range DMA
+ * index, -ENOENT when the device has no private data, or the error of the
+ * failing step.
+ */
+static int dma_afu_test(struct afu_mf_rawdev *dev)
+{
+	struct n3000_afu_priv *priv = NULL;
+	struct dma_afu_ctx *ctx = NULL;
+	struct rte_pmd_afu_dma_cfg *cfg = NULL;
+	msgdma_ctrl ctrl;
+	uint64_t offset = 0;
+	uint32_t i = 0;
+	int ret = 0;
+
+	if (!dev)
+		return -EINVAL;
+
+	if (!dev->priv)
+		return -ENOENT;
+
+	priv = (struct n3000_afu_priv *)dev->priv;
+	cfg = &priv->dma_cfg;
+	if (cfg->index >= NUM_N3000_DMA)
+		return -EINVAL;
+	ctx = &priv->dma_ctx[cfg->index];
+
+	ctx->pattern = (int)cfg->pattern;
+	ctx->verbose = (int)cfg->verbose;
+	ctx->dma_buf_size = cfg->size;
+
+	ret = dma_afu_buf_alloc(ctx, cfg);
+	if (ret)
+		goto free;
+
+	printf("Initialize test buffer\n");
+	dma_afu_buf_init(ctx, cfg->length);
+
+	/* enable interrupt */
+	ctrl.csr = 0;
+	ctrl.global_intr_en_mask = 1;
+	rte_write32(ctrl.csr, CSR_CONTROL(ctx->csr_addr));
+
+	printf("Host %p to FPGA 0x%x with 0x%x bytes\n", ctx->data_buf,
+		cfg->offset, cfg->length);
+	ret = dma_transfer_sync(ctx, cfg->offset, (uint64_t)ctx->data_buf,
+		cfg->length, HOST_TO_FPGA);
+	if (ret) {
+		AFU_MF_PMD_ERR("Failed to transfer data from host to FPGA");
+		goto end;
+	}
+	memset(ctx->data_buf, 0, cfg->length);
+
+	printf("FPGA 0x%x to Host %p with 0x%x bytes\n", cfg->offset,
+		ctx->data_buf, cfg->length);
+	ret = dma_transfer_sync(ctx, (uint64_t)ctx->data_buf, cfg->offset,
+		cfg->length, FPGA_TO_HOST);
+	if (ret) {
+		AFU_MF_PMD_ERR("Failed to transfer data from FPGA to host");
+		goto end;
+	}
+	ret = dma_afu_buf_verify(ctx, cfg->length);
+	if (ret)
+		goto end;
+
+	/*
+	 * Pick a second FPGA region for the FPGA-to-FPGA copy: right behind
+	 * the first one if it fits, else at address 0 if the first region
+	 * starts far enough in. The sum is formed in 64 bits so that large
+	 * offsets/lengths cannot wrap before the comparison with mem_size.
+	 */
+	if (((uint64_t)cfg->offset + (uint64_t)cfg->length * 2) <=
+		ctx->mem_size)
+		offset = (uint64_t)cfg->offset + cfg->length;
+	else if (cfg->offset > cfg->length)
+		offset = 0;
+	else
+		goto end;
+
+	printf("FPGA 0x%x to FPGA 0x%"PRIx64" with 0x%x bytes\n",
+		cfg->offset, offset, cfg->length);
+	ret = dma_transfer_sync(ctx, offset, cfg->offset, cfg->length,
+		FPGA_TO_FPGA);
+	if (ret) {
+		AFU_MF_PMD_ERR("Failed to transfer data from FPGA to FPGA");
+		goto end;
+	}
+
+	printf("FPGA 0x%"PRIx64" to Host %p with 0x%x bytes\n", offset,
+		ctx->data_buf, cfg->length);
+	ret = dma_transfer_sync(ctx, (uint64_t)ctx->data_buf, offset,
+		cfg->length, FPGA_TO_HOST);
+	if (ret) {
+		AFU_MF_PMD_ERR("Failed to transfer data from FPGA to host");
+		goto end;
+	}
+	ret = dma_afu_buf_verify(ctx, cfg->length);
+	if (ret)
+		goto end;
+
+	printf("Sweep with aligned address and size\n");
+	ret = sweep_test(ctx, cfg->length, cfg->offset, 0, 0);
+	if (ret)
+		goto end;
+
+	if (cfg->unaligned) {
+		printf("Sweep with unaligned address and size\n");
+		/* {host buffer offset, size decrement} pairs to exercise */
+		struct unaligned_set {
+			uint64_t addr_offset;
+			uint64_t size_dec;
+		} param[] = {{61, 5}, {3, 0}, {7, 3}, {0, 3}, {0, 61}, {0, 7}};
+		for (i = 0; i < ARRAY_SIZE(param); i++) {
+			ret = sweep_test(ctx, cfg->length, cfg->offset,
+				param[i].addr_offset, param[i].size_dec);
+			if (ret)
+				break;
+		}
+	}
+
+end:
+	/* disable interrupt */
+	ctrl.global_intr_en_mask = 0;
+	rte_write32(ctrl.csr, CSR_CONTROL(ctx->csr_addr));
+
+free:
+	dma_afu_buf_free(ctx);
+	return ret;
+}
+
+/*
+ * Resolve the PCI device behind an AFU raw device.
+ *
+ * The raw device's generic device is converted to its AFU device, whose own
+ * rawdev's generic device is in turn converted to the PCI device.
+ * Returns NULL when any link in that chain is missing.
+ */
+static struct rte_pci_device *n3000_afu_get_pci_dev(struct afu_mf_rawdev *dev)
+{
+	struct rte_afu_device *afu = NULL;
+
+	if (dev == NULL || dev->rawdev == NULL)
+		return NULL;
+	if (dev->rawdev->device == NULL)
+		return NULL;
+
+	afu = RTE_DEV_TO_AFU(dev->rawdev->device);
+	if (afu->rawdev == NULL || afu->rawdev->device == NULL)
+		return NULL;
+
+	return RTE_DEV_TO_PCI(afu->rawdev->device);
+}
+
+#ifdef VFIO_PRESENT
+/*
+ * Attach count eventfds to consecutive MSI-X vectors, starting at
+ * vec_start, through the VFIO_DEVICE_SET_IRQS ioctl of the underlying PCI
+ * device.
+ *
+ * Returns the ioctl result (0 on success), -EINVAL on bad arguments,
+ * -ENODEV when the PCI device cannot be resolved and -ENOMEM when the
+ * request buffer cannot be allocated.
+ */
+static int dma_afu_set_irqs(struct afu_mf_rawdev *dev, uint32_t vec_start,
+	uint32_t count, int *efds)
+{
+	struct rte_pci_device *pci = NULL;
+	struct vfio_irq_set *req = NULL;
+	size_t fds_len = 0;
+	size_t req_len = 0;
+	int dev_fd = 0;
+	int rc = 0;
+
+	if (dev == NULL || efds == NULL)
+		return -EINVAL;
+	if (count == 0 || count > MAX_MSIX_VEC)
+		return -EINVAL;
+
+	pci = n3000_afu_get_pci_dev(dev);
+	if (pci == NULL)
+		return -ENODEV;
+	dev_fd = rte_intr_dev_fd_get(pci->intr_handle);
+
+	/* request header followed by one eventfd per vector */
+	fds_len = sizeof(*efds) * count;
+	req_len = sizeof(*req) + fds_len;
+	req = rte_zmalloc(NULL, req_len, 0);
+	if (req == NULL)
+		return -ENOMEM;
+
+	req->argsz = (uint32_t)req_len;
+	req->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+		VFIO_IRQ_SET_ACTION_TRIGGER;
+	req->index = VFIO_PCI_MSIX_IRQ_INDEX;
+	req->start = vec_start;
+	req->count = count;
+	rte_memcpy(&req->data, efds, fds_len);
+
+	rc = ioctl(dev_fd, VFIO_DEVICE_SET_IRQS, req);
+	if (rc != 0)
+		AFU_MF_PMD_ERR("Error enabling MSI-X interrupts\n");
+
+	rte_free(req);
+	return rc;
+}
+#endif
+
+/*
+ * Return the mapped address of the port selected by dev->port.
+ *
+ * The port attribute register in BAR 0 tells whether the port is
+ * implemented and in which BAR, at which offset, its registers live.
+ *
+ * Returns NULL when the PCI device cannot be resolved, a required BAR is
+ * not mapped, the port is not implemented or the BAR index is out of
+ * range.
+ */
+static void *n3000_afu_get_port_addr(struct afu_mf_rawdev *dev)
+{
+	struct rte_pci_device *pci_dev = NULL;
+	uint8_t *addr = NULL;
+	uint64_t val = 0;
+	uint32_t bar = 0;
+
+	pci_dev = n3000_afu_get_pci_dev(dev);
+	if (!pci_dev)
+		return NULL;
+
+	addr = (uint8_t *)pci_dev->mem_resource[0].addr;
+	/* reading through an unmapped BAR would dereference NULL */
+	if (!addr) {
+		AFU_MF_PMD_ERR("BAR 0 is not mapped");
+		return NULL;
+	}
+
+	val = rte_read64(addr + PORT_ATTR_REG(dev->port));
+	if (!PORT_IMPLEMENTED(val)) {
+		AFU_MF_PMD_INFO("FIU port %d is not implemented", dev->port);
+		return NULL;
+	}
+
+	bar = PORT_BAR(val);
+	if (bar >= PCI_MAX_RESOURCE) {
+		AFU_MF_PMD_ERR("BAR index %u is out of limit", bar);
+		return NULL;
+	}
+
+	addr = (uint8_t *)pci_dev->mem_resource[bar].addr;
+	/* do not turn a bare offset into a pointer */
+	if (!addr) {
+		AFU_MF_PMD_ERR("BAR %u is not mapped", bar);
+		return NULL;
+	}
+
+	return addr + PORT_OFFSET(val);
+}
+
+/*
+ * Look up the interrupt capability of the port.
+ *
+ * Starting at the port address, the DFH list is walked until a private
+ * feature with id PORT_FEATURE_UINT_ID is found; its capability register
+ * provides the first interrupt vector and the vector count.
+ *
+ * @vec_start: optional output, first vector
+ * @vec_count: optional output, number of vectors
+ *
+ * Returns 0 when the feature was found, -ENOENT when the port address is
+ * unavailable or the list ends without a match.
+ */
+static int n3000_afu_get_irq_capability(struct afu_mf_rawdev *dev,
+	uint32_t *vec_start, uint32_t *vec_count)
+{
+	uint8_t *addr = NULL;
+	uint64_t val = 0;
+	uint64_t header = 0;
+	uint64_t next_offset = 0;
+
+	addr = (uint8_t *)n3000_afu_get_port_addr(dev);
+	if (!addr)
+		return -ENOENT;
+
+	do {
+		/* next_offset is 0 on the first pass: start at the port itself */
+		addr += next_offset;
+		header = rte_read64(addr);
+		if ((DFH_TYPE(header) == DFH_TYPE_PRIVATE) &&
+			(DFH_FEATURE_ID(header) == PORT_FEATURE_UINT_ID)) {
+			val = rte_read64(addr + PORT_UINT_CAP_REG);
+			if (vec_start)
+				*vec_start = PORT_VEC_START(val);
+			if (vec_count)
+				*vec_count = PORT_VEC_COUNT(val);
+			return 0;
+		}
+		next_offset = DFH_NEXT_OFFSET(header);
+		/* stop on an all-ones (low 16 bits) or zero offset */
+		if (((next_offset & 0xffff) == 0xffff) || (next_offset == 0))
+			break;
+	} while (!DFH_EOL(header));
+
+	return -ENOENT;
+}
+
+/*
+ * Free the DSM, source and destination buffers of the NLB context and
+ * reset the pointers that referred to them.
+ *
+ * Returns 0 on success, -EINVAL on a NULL device and -ENOENT when the
+ * device has no private data.
+ */
+static int nlb_afu_ctx_release(struct afu_mf_rawdev *dev)
+{
+	struct nlb_afu_ctx *nlb = NULL;
+
+	if (dev == NULL)
+		return -EINVAL;
+	if (dev->priv == NULL)
+		return -ENOENT;
+
+	nlb = &((struct n3000_afu_priv *)dev->priv)->nlb_ctx;
+
+	/* status_ptr points into the DSM buffer, so it is reset with it */
+	rte_free(nlb->dsm_ptr);
+	nlb->dsm_ptr = NULL;
+	nlb->status_ptr = NULL;
+
+	rte_free(nlb->src_ptr);
+	nlb->src_ptr = NULL;
+
+	rte_free(nlb->dest_ptr);
+	nlb->dest_ptr = NULL;
+
+	return 0;
+}
+
+/*
+ * Initialise the NLB context: record the register base address and
+ * allocate the DSM, source and destination buffers together with their IO
+ * addresses.
+ *
+ * Returns 0 on success, -EINVAL on NULL arguments, -ENOENT when the device
+ * has no private data and -ENOMEM when an allocation or address lookup
+ * fails (everything allocated so far is released in that case).
+ */
+static int nlb_afu_ctx_init(struct afu_mf_rawdev *dev, uint8_t *addr)
+{
+	struct n3000_afu_priv *priv = NULL;
+	struct nlb_afu_ctx *nlb = NULL;
+
+	if (dev == NULL || addr == NULL)
+		return -EINVAL;
+
+	priv = (struct n3000_afu_priv *)dev->priv;
+	if (priv == NULL)
+		return -ENOENT;
+
+	nlb = &priv->nlb_ctx;
+	nlb->addr = addr;
+
+	/* every failure from here on is reported as -ENOMEM */
+	nlb->dsm_ptr = (uint8_t *)rte_zmalloc(NULL, DSM_SIZE, TEST_MEM_ALIGN);
+	if (nlb->dsm_ptr == NULL)
+		goto fail;
+	nlb->dsm_iova = rte_malloc_virt2iova(nlb->dsm_ptr);
+	if (nlb->dsm_iova == RTE_BAD_IOVA)
+		goto fail;
+
+	nlb->src_ptr = (uint8_t *)rte_zmalloc(NULL, NLB_BUF_SIZE,
+		TEST_MEM_ALIGN);
+	if (nlb->src_ptr == NULL)
+		goto fail;
+	nlb->src_iova = rte_malloc_virt2iova(nlb->src_ptr);
+	if (nlb->src_iova == RTE_BAD_IOVA)
+		goto fail;
+
+	nlb->dest_ptr = (uint8_t *)rte_zmalloc(NULL, NLB_BUF_SIZE,
+		TEST_MEM_ALIGN);
+	if (nlb->dest_ptr == NULL)
+		goto fail;
+	nlb->dest_iova = rte_malloc_virt2iova(nlb->dest_ptr);
+	if (nlb->dest_iova == RTE_BAD_IOVA)
+		goto fail;
+
+	/* the status block lives inside the DSM buffer */
+	nlb->status_ptr = (struct nlb_dsm_status *)(nlb->dsm_ptr + DSM_STATUS);
+	return 0;
+
+fail:
+	nlb_afu_ctx_release(dev);
+	return -ENOMEM;
+}
+
+/*
+ * Release the resources of all DMA contexts: the descriptor and magic
+ * buffers of each context and the event fd they share.
+ *
+ * Returns 0 on success, -EINVAL on a NULL device and -ENOENT when the
+ * device has no private data.
+ */
+static int dma_afu_ctx_release(struct afu_mf_rawdev *dev)
+{
+	struct n3000_afu_priv *priv = NULL;
+	struct dma_afu_ctx *ctx = NULL;
+	int event_fd = -1;
+	int i = 0;
+
+	if (!dev)
+		return -EINVAL;
+
+	priv = (struct n3000_afu_priv *)dev->priv;
+	if (!priv)
+		return -ENOENT;
+
+	/*
+	 * The eventfd is created once, for context 0, and its descriptor is
+	 * copied into every context, so it must be closed exactly once.
+	 */
+	event_fd = priv->dma_ctx[0].event_fd;
+
+	/*
+	 * Each context owns its own descriptor and magic buffers; freeing
+	 * only context 0 would leak the others.
+	 * NOTE(review): assumes priv was zero-initialised, so contexts that
+	 * were never set up hold NULL pointers here - confirm at the
+	 * allocation site.
+	 */
+	for (i = 0; i < NUM_N3000_DMA; i++) {
+		ctx = &priv->dma_ctx[i];
+
+		rte_free(ctx->desc_buf);
+		ctx->desc_buf = NULL;
+
+		rte_free(ctx->magic_buf);
+		ctx->magic_buf = NULL;
+
+		/* mark the fd invalid so later polls fail cleanly */
+		ctx->event_fd = -1;
+	}
+
+	if (event_fd >= 0)
+		close(event_fd);
+
+	return 0;
+}
+
+/*
+ * Prepare DMA context number index of a device.
+ *
+ * Derives the CSR, descriptor and ASE register addresses from addr, picks the
+ * memory size for this index from a fixed table and allocates the descriptor
+ * and magic buffers. For index 0 an eventfd is created as well (and, when
+ * VFIO_PRESENT is defined, handed to dma_afu_set_irqs()); the descriptor is
+ * kept in a function-static variable and copied into every context.
+ *
+ * Returns 0 on success, -EINVAL on bad arguments, -ENOENT when the device has
+ * no private data, the error of n3000_afu_get_irq_capability(), -EBADF when
+ * the eventfd cannot be created, or -ENOMEM when a buffer cannot be set up
+ * (dma_afu_ctx_release() is called in that case).
+ */
+static int dma_afu_ctx_init(struct afu_mf_rawdev *dev, int index, uint8_t *addr)
+{
+	static int efds[1] = {0};
+	uint64_t mem_sz[] = {0x100000000, 0x100000000, 0x40000000, 0x1000000};
+	struct n3000_afu_priv *afu = NULL;
+	struct dma_afu_ctx *dma = NULL;
+	uint32_t first_vec = 0;
+	int rc = 0;
+
+	if (dev == NULL || addr == NULL)
+		return -EINVAL;
+	if (index < 0 || index >= NUM_N3000_DMA)
+		return -EINVAL;
+
+	afu = (struct n3000_afu_priv *)dev->priv;
+	if (afu == NULL)
+		return -ENOENT;
+
+	dma = &afu->dma_ctx[index];
+	dma->index = index;
+	dma->addr = addr;
+	dma->csr_addr = addr + DMA_CSR;
+	dma->desc_addr = addr + DMA_DESC;
+	dma->ase_ctrl_addr = addr + DMA_ASE_CTRL;
+	dma->ase_data_addr = addr + DMA_ASE_DATA;
+	dma->mem_size = mem_sz[index];
+	dma->cur_ase_page = INVALID_ASE_PAGE;
+
+	if (index == 0) {
+		/* the event descriptor is only created for the first DMA */
+		rc = n3000_afu_get_irq_capability(dev, &first_vec, NULL);
+		if (rc != 0)
+			return rc;
+
+		efds[0] = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+		if (efds[0] < 0) {
+			AFU_MF_PMD_ERR("eventfd create failed");
+			return -EBADF;
+		}
+#ifdef VFIO_PRESENT
+		/* a failure here is logged only, initialization goes on */
+		if (dma_afu_set_irqs(dev, first_vec, 1, efds))
+			AFU_MF_PMD_ERR("DMA interrupt setup failed");
+#endif
+	}
+	dma->event_fd = efds[0];
+
+	dma->desc_buf = rte_zmalloc(NULL, sizeof(msgdma_ext_desc),
+		DMA_ALIGN_BYTES);
+	if (dma->desc_buf == NULL)
+		goto no_mem;
+
+	dma->magic_buf = rte_zmalloc(NULL, MAGIC_BUF_SIZE, TEST_MEM_ALIGN);
+	if (dma->magic_buf == NULL)
+		goto no_mem;
+
+	dma->magic_iova = rte_malloc_virt2iova(dma->magic_buf);
+	if (dma->magic_iova == RTE_BAD_IOVA)
+		goto no_mem;
+
+	return 0;
+
+no_mem:
+	dma_afu_ctx_release(dev);
+	return -ENOMEM;
+}
+
+/*
+ * Walk the feature header list starting at dev->addr and initialize a
+ * context for every NLB0 and DMA block that is found.
+ *
+ * Returns 0 when the walk finishes, -EINVAL when dev is NULL, -ENOENT when
+ * the device has no private data, or the first error reported by
+ * nlb_afu_ctx_init() / dma_afu_ctx_init().
+ */
+static int n3000_afu_ctx_init(struct afu_mf_rawdev *dev)
+{
+	struct n3000_afu_priv *priv = NULL;
+	uint8_t *addr = NULL;
+	uint64_t header = 0;
+	uint64_t uuid_hi = 0;
+	uint64_t uuid_lo = 0;
+	uint64_t next_offset = 0;
+	int ret = 0;
+
+	if (!dev)
+		return -EINVAL;
+
+	priv = (struct n3000_afu_priv *)dev->priv;
+	if (!priv)
+		return -ENOENT;
+
+	addr = (uint8_t *)dev->addr;
+	do {
+		/* next_offset is 0 on the first pass, so this starts at dev->addr */
+		addr += next_offset;
+		header = rte_read64(addr);
+		uuid_lo = rte_read64(addr + DFH_UUID_L_OFFSET);
+		uuid_hi = rte_read64(addr + DFH_UUID_H_OFFSET);
+
+		if ((DFH_TYPE(header) == DFH_TYPE_AFU) &&
+			(uuid_lo == N3000_NLB0_UUID_L) &&
+			(uuid_hi == N3000_NLB0_UUID_H)) {
+			AFU_MF_PMD_INFO("AFU NLB0 found @ %p", (void *)addr);
+			ret = nlb_afu_ctx_init(dev, addr);
+			if (ret)
+				return ret;
+		} else if ((DFH_TYPE(header) == DFH_TYPE_BBB) &&
+			(uuid_lo == N3000_DMA_UUID_L) &&
+			(uuid_hi == N3000_DMA_UUID_H) &&
+			(priv->num_dma < NUM_N3000_DMA)) {
+			/* DMA blocks are numbered in the order they are found */
+			AFU_MF_PMD_INFO("AFU DMA%d found @ %p",
+				priv->num_dma, (void *)addr);
+			ret = dma_afu_ctx_init(dev, priv->num_dma, addr);
+			if (ret)
+				return ret;
+			priv->num_dma++;
+		} else {
+			AFU_MF_PMD_DEBUG("DFH: type %"PRIu64
+				", uuid %016"PRIx64"%016"PRIx64,
+				DFH_TYPE(header), uuid_hi, uuid_lo);
+		}
+
+		/*
+		 * Stop when the header carries no usable link: an offset of 0
+		 * or one whose low 16 bits are all ones.
+		 * NOTE(review): addr is never checked against the size of the
+		 * mapped region - confirm the hardware always ends the list.
+		 */
+		next_offset = DFH_NEXT_OFFSET(header);
+		if (((next_offset & 0xffff) == 0xffff) || (next_offset == 0))
+			break;
+	} while (!DFH_EOL(header));
+
+	return 0;
+}
+
+/*
+ * Driver init hook: make sure the device owns a zero-filled
+ * struct n3000_afu_priv, then discover and set up the NLB/DMA contexts.
+ *
+ * Returns -EINVAL when dev is NULL, -ENOMEM when the private data cannot be
+ * allocated, otherwise the result of n3000_afu_ctx_init().
+ */
+static int n3000_afu_init(struct afu_mf_rawdev *dev)
+{
+	if (dev == NULL)
+		return -EINVAL;
+
+	/* private data left over from an earlier call is reused as is */
+	if (dev->priv == NULL)
+		dev->priv = rte_zmalloc(NULL, sizeof(struct n3000_afu_priv), 0);
+
+	if (dev->priv == NULL)
+		return -ENOMEM;
+
+	return n3000_afu_ctx_init(dev);
+}
+
+/*
+ * Driver config hook: validate a test configuration and store it.
+ *
+ * config must point to a struct rte_pmd_afu_n3000_cfg and config_size must
+ * be exactly the size of that structure. Depending on cfg->type either the
+ * NLB or the DMA settings are range-checked and copied into the private
+ * data; cfg_type is only updated when the whole configuration was accepted.
+ * For DMA index 3 a length that is not a multiple of 64 is rounded down in
+ * the caller's structure before it is copied.
+ *
+ * Returns 0 on success, -EINVAL for a NULL/ill-sized argument or an
+ * out-of-range setting, -ENOENT when the device has no private data.
+ */
+static int n3000_afu_config(struct afu_mf_rawdev *dev, void *config,
+	size_t config_size)
+{
+	struct n3000_afu_priv *priv = NULL;
+	struct rte_pmd_afu_n3000_cfg *cfg = NULL;
+	int i = 0;
+	uint64_t top = 0;
+
+	if (!dev || !config || !config_size)
+		return -EINVAL;
+
+	priv = (struct n3000_afu_priv *)dev->priv;
+	if (!priv)
+		return -ENOENT;
+
+	if (config_size != sizeof(struct rte_pmd_afu_n3000_cfg))
+		return -EINVAL;
+
+	cfg = (struct rte_pmd_afu_n3000_cfg *)config;
+	if (cfg->type == RTE_PMD_AFU_N3000_NLB) {
+		/* loopback is the only mode accepted here */
+		if (cfg->nlb_cfg.mode != NLB_MODE_LPBK)
+			return -EINVAL;
+		if ((cfg->nlb_cfg.read_vc > NLB_VC_RANDOM) ||
+			(cfg->nlb_cfg.write_vc > NLB_VC_RANDOM))
+			return -EINVAL;
+		if (cfg->nlb_cfg.wrfence_vc > NLB_VC_VH1)
+			return -EINVAL;
+		if (cfg->nlb_cfg.cache_hint > NLB_RDLINE_MIXED)
+			return -EINVAL;
+		if (cfg->nlb_cfg.cache_policy > NLB_WRPUSH_I)
+			return -EINVAL;
+		if ((cfg->nlb_cfg.multi_cl != 1) &&
+			(cfg->nlb_cfg.multi_cl != 2) &&
+			(cfg->nlb_cfg.multi_cl != 4))
+			return -EINVAL;
+		if ((cfg->nlb_cfg.begin < MIN_CACHE_LINES) ||
+			(cfg->nlb_cfg.begin > MAX_CACHE_LINES))
+			return -EINVAL;
+		if ((cfg->nlb_cfg.end < cfg->nlb_cfg.begin) ||
+			(cfg->nlb_cfg.end > MAX_CACHE_LINES))
+			return -EINVAL;
+		rte_memcpy(&priv->nlb_cfg, &cfg->nlb_cfg,
+			sizeof(struct rte_pmd_afu_nlb_cfg));
+	} else if (cfg->type == RTE_PMD_AFU_N3000_DMA) {
+		if (cfg->dma_cfg.index >= NUM_N3000_DMA)
+			return -EINVAL;
+		i = cfg->dma_cfg.index;
+		if (cfg->dma_cfg.length > priv->dma_ctx[i].mem_size)
+			return -EINVAL;
+		if (cfg->dma_cfg.offset >= priv->dma_ctx[i].mem_size)
+			return -EINVAL;
+		/*
+		 * length and offset are both 32-bit: widen before adding so
+		 * the sum cannot wrap around and slip past the bound check.
+		 */
+		top = (uint64_t)cfg->dma_cfg.length + cfg->dma_cfg.offset;
+		if ((top == 0) || (top > priv->dma_ctx[i].mem_size))
+			return -EINVAL;
+		if (i == 3) {  /* QDR connected to DMA3 */
+			if (cfg->dma_cfg.length & 0x3f) {
+				cfg->dma_cfg.length &= ~0x3f;
+				AFU_MF_PMD_INFO("Round size to %x for QDR",
+					cfg->dma_cfg.length);
+			}
+		}
+		rte_memcpy(&priv->dma_cfg, &cfg->dma_cfg,
+			sizeof(struct rte_pmd_afu_dma_cfg));
+	} else {
+		AFU_MF_PMD_ERR("Invalid type of N3000 AFU");
+		return -EINVAL;
+	}
+
+	priv->cfg_type = cfg->type;
+	return 0;
+}
+
+/*
+ * Driver test hook: run the test matching the stored configuration type.
+ *
+ * Returns the result of nlb_afu_test() or dma_afu_test(), -EINVAL when dev
+ * is NULL or no valid configuration type is stored, and -ENOENT when the
+ * device has no private data.
+ */
+static int n3000_afu_test(struct afu_mf_rawdev *dev)
+{
+	struct n3000_afu_priv *afu = NULL;
+
+	if (dev == NULL)
+		return -EINVAL;
+
+	afu = (struct n3000_afu_priv *)dev->priv;
+	if (afu == NULL)
+		return -ENOENT;
+
+	switch (afu->cfg_type) {
+	case RTE_PMD_AFU_N3000_NLB:
+		AFU_MF_PMD_INFO("Test NLB");
+		return nlb_afu_test(dev);
+	case RTE_PMD_AFU_N3000_DMA:
+		AFU_MF_PMD_INFO("Test DMA%u", afu->dma_cfg.index);
+		return dma_afu_test(dev);
+	default:
+		AFU_MF_PMD_ERR("Please configure AFU before test");
+		return -EINVAL;
+	}
+}
+
+/*
+ * Driver close hook: release the NLB and DMA contexts, then free the
+ * private data and clear the pointer.
+ *
+ * Returns 0, or -EINVAL when dev is NULL. The return codes of the two
+ * release helpers are not checked.
+ */
+static int n3000_afu_close(struct afu_mf_rawdev *dev)
+{
+	if (dev == NULL)
+		return -EINVAL;
+
+	(void)nlb_afu_ctx_release(dev);
+	(void)dma_afu_ctx_release(dev);
+
+	rte_free(dev->priv);
+	dev->priv = NULL;
+	return 0;
+}
+
+/*
+ * Driver dump hook: print the context selected by the stored configuration
+ * type, one "name:<tabs>value" line per field. A NULL f means stdout.
+ *
+ * Returns 0 on success, -EINVAL when dev is NULL or no valid configuration
+ * type is stored, and -ENOENT when the device has no private data.
+ */
+static int n3000_afu_dump(struct afu_mf_rawdev *dev, FILE *f)
+{
+	struct n3000_afu_priv *afu = NULL;
+	FILE *out = NULL;
+
+	if (dev == NULL)
+		return -EINVAL;
+
+	afu = (struct n3000_afu_priv *)dev->priv;
+	if (afu == NULL)
+		return -ENOENT;
+
+	out = (f != NULL) ? f : stdout;
+
+	switch (afu->cfg_type) {
+	case RTE_PMD_AFU_N3000_NLB: {
+		struct nlb_afu_ctx *nlb = &afu->nlb_ctx;
+
+		fprintf(out, "addr:\t\t%p\n", (void *)nlb->addr);
+		fprintf(out, "dsm_ptr:\t%p\n", (void *)nlb->dsm_ptr);
+		fprintf(out, "dsm_iova:\t0x%"PRIx64"\n", nlb->dsm_iova);
+		fprintf(out, "src_ptr:\t%p\n", (void *)nlb->src_ptr);
+		fprintf(out, "src_iova:\t0x%"PRIx64"\n", nlb->src_iova);
+		fprintf(out, "dest_ptr:\t%p\n", (void *)nlb->dest_ptr);
+		fprintf(out, "dest_iova:\t0x%"PRIx64"\n", nlb->dest_iova);
+		fprintf(out, "status_ptr:\t%p\n", (void *)nlb->status_ptr);
+		break;
+	}
+	case RTE_PMD_AFU_N3000_DMA: {
+		/* the context of the DMA selected by the last configuration */
+		struct dma_afu_ctx *dma = &afu->dma_ctx[afu->dma_cfg.index];
+
+		fprintf(out, "index:\t\t%d\n", dma->index);
+		fprintf(out, "addr:\t\t%p\n", (void *)dma->addr);
+		fprintf(out, "csr_addr:\t%p\n", (void *)dma->csr_addr);
+		fprintf(out, "desc_addr:\t%p\n", (void *)dma->desc_addr);
+		fprintf(out, "ase_ctrl_addr:\t%p\n",
+			(void *)dma->ase_ctrl_addr);
+		fprintf(out, "ase_data_addr:\t%p\n",
+			(void *)dma->ase_data_addr);
+		fprintf(out, "desc_buf:\t%p\n", (void *)dma->desc_buf);
+		fprintf(out, "magic_buf:\t%p\n", (void *)dma->magic_buf);
+		fprintf(out, "magic_iova:\t0x%"PRIx64"\n", dma->magic_iova);
+		break;
+	}
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Driver reset hook: pulse the soft reset bit of the port control register.
+ *
+ * Returns 0 on success or -ENOENT when n3000_afu_get_port_addr() yields no
+ * address.
+ */
+static int n3000_afu_reset(struct afu_mf_rawdev *dev)
+{
+	uint8_t *addr = NULL;
+	uint64_t val = 0;
+
+	addr = (uint8_t *)n3000_afu_get_port_addr(dev);
+	if (!addr)
+		return -ENOENT;
+
+	/* read-modify-write: set the reset bit, keep every other bit */
+	val = rte_read64(addr + PORT_CTRL_REG);
+	val |= PORT_SOFT_RESET;
+	rte_write64(val, addr + PORT_CTRL_REG);
+	/* keep the bit set for 100 us before clearing it again */
+	rte_delay_us(100);
+	val &= ~PORT_SOFT_RESET;
+	rte_write64(val, addr + PORT_CTRL_REG);
+
+	return 0;
+}
+
+/* Operation table of the N3000 AFU; start and stop are not provided. */
+static struct afu_mf_ops n3000_afu_ops = {
+	.init = n3000_afu_init,
+	.config = n3000_afu_config,
+	.start = NULL,
+	.stop = NULL,
+	.test = n3000_afu_test,
+	.close = n3000_afu_close,
+	.dump = n3000_afu_dump,
+	.reset = n3000_afu_reset
+};
+
+/* Driver descriptor: pairs the N3000 AFU UUID (low, high) with its ops. */
+struct afu_mf_drv n3000_afu_drv = {
+	.uuid = { N3000_AFU_UUID_L, N3000_AFU_UUID_H },
+	.ops = &n3000_afu_ops
+};
diff --git a/drivers/raw/afu_mf/n3000_afu.h b/drivers/raw/afu_mf/n3000_afu.h
new file mode 100644
index 0000000..4c740da
--- /dev/null
+++ b/drivers/raw/afu_mf/n3000_afu.h
@@ -0,0 +1,333 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Intel Corporation
+ */
+
+#ifndef _N3000_AFU_H_
+#define _N3000_AFU_H_
+
+#include "afu_mf_rawdev.h"
+#include "rte_pmd_afu.h"
+
+/*
+ * 128-bit UUIDs split into low (_L) and high (_H) 64-bit halves.
+ * N3000_AFU_* identifies the driver; the NLB0 and DMA values are compared
+ * with the UUID words read from each feature header.
+ */
+#define N3000_AFU_UUID_L  0xc000c9660d824272
+#define N3000_AFU_UUID_H  0x9aeffe5f84570612
+#define N3000_NLB0_UUID_L 0xf89e433683f9040b
+#define N3000_NLB0_UUID_H 0xd8424dc4a4a3c413
+#define N3000_DMA_UUID_L  0xa9149a35bace01ea
+#define N3000_DMA_UUID_H  0xef82def7f6ec40fc
+
+/* N3000 AFU driver descriptor, defined in n3000_afu.c */
+extern struct afu_mf_drv n3000_afu_drv;
+
+/* number of DMA contexts kept per device (size of dma_ctx[]) */
+#define NUM_N3000_DMA  4
+/* presumably the highest number of MSI-X vectors handled - TODO confirm */
+#define MAX_MSIX_VEC   7
+
+/*
+ * N3000 DFL definition
+ *
+ * DFH_* decode a 64-bit feature header: type in bits 63:60, end-of-list flag
+ * in bit 40, offset of the next header in bits 39:16 and feature id in bits
+ * 11:0. The UUID halves follow the header at byte offsets 8 and 16.
+ * PORT_* decode the port attribute and user-interrupt capability registers.
+ */
+#define DFH_UUID_L_OFFSET  8
+#define DFH_UUID_H_OFFSET  16
+#define DFH_TYPE(hdr)  (((hdr) >> 60) & 0xf)
+#define DFH_TYPE_AFU  1
+#define DFH_TYPE_BBB  2
+#define DFH_TYPE_PRIVATE  3
+#define DFH_EOL(hdr)  (((hdr) >> 40) & 0x1)
+#define DFH_NEXT_OFFSET(hdr)  (((hdr) >> 16) & 0xffffff)
+#define DFH_FEATURE_ID(hdr)  ((hdr) & 0xfff)
+#define PORT_ATTR_REG(n)  (((n) << 3) + 0x38)
+#define PORT_IMPLEMENTED(attr)  (((attr) >> 60) & 0x1)
+#define PORT_BAR(attr)  (((attr) >> 32) & 0x7)
+#define PORT_OFFSET(attr)  ((attr) & 0xffffff)
+#define PORT_FEATURE_UINT_ID  0x12
+#define PORT_UINT_CAP_REG  0x8
+/* first vector: bits 23:12 of the capability register */
+#define PORT_VEC_START(cap)  (((cap) >> 12) & 0xfff)
+/* number of vectors: bits 11:0, i.e. the field below the start vector */
+#define PORT_VEC_COUNT(cap)  ((cap) & 0xfff)
+#define PORT_CTRL_REG  0x38
+#define PORT_SOFT_RESET  (0x1 << 0)
+
+/*
+ * NLB registers definition
+ * NOTE(review): the values look like byte offsets from the NLB base address
+ * kept in struct nlb_afu_ctx - confirm against the register accessors.
+ */
+#define CSR_SCRATCHPAD0    0x100
+#define CSR_SCRATCHPAD1    0x108
+#define CSR_AFU_DSM_BASEL  0x110
+#define CSR_AFU_DSM_BASEH  0x114
+#define CSR_SRC_ADDR       0x120
+#define CSR_DST_ADDR       0x128
+#define CSR_NUM_LINES      0x130
+#define CSR_CTL            0x138
+#define CSR_CFG            0x140
+#define CSR_INACT_THRESH   0x148
+#define CSR_INTERRUPT0     0x150
+#define CSR_SWTEST_MSG     0x158
+#define CSR_STATUS0        0x160
+#define CSR_STATUS1        0x168
+#define CSR_ERROR          0x170
+#define CSR_STRIDE         0x178
+#define CSR_HE_INFO0       0x180
+
+/* size in bytes of the DSM buffer allocated by nlb_afu_ctx_init() */
+#define DSM_SIZE           0x200000
+/* byte offset of struct nlb_dsm_status inside the DSM buffer */
+#define DSM_STATUS         0x40
+#define DSM_POLL_INTERVAL  5  /* ms */
+#define DSM_TIMEOUT        1000  /* ms */
+
+/* size in bytes of each of the NLB source and destination buffers */
+#define NLB_BUF_SIZE  0x400000
+/* alignment passed to rte_zmalloc() for the test buffers */
+#define TEST_MEM_ALIGN  1024
+
+/*
+ * 32-bit NLB control value, usable either as the raw word (csr) or through
+ * its bitfields. Presumably the image of the CSR_CTL register - confirm.
+ */
+struct nlb_csr_ctl {
+	union {
+		uint32_t csr;
+		struct {
+			uint32_t reset:1;
+			uint32_t start:1;
+			uint32_t force_completion:1;
+			uint32_t reserved:29;
+		};
+	};
+};
+
+/*
+ * 32-bit NLB configuration value, usable either as the raw word (csr) or
+ * through its bitfields; the field widths add up to 32 bits.
+ * Presumably the image of the CSR_CFG register - confirm.
+ */
+struct nlb_csr_cfg {
+	union {
+		uint32_t csr;
+		struct {
+			uint32_t wrthru_en:1;
+			uint32_t cont:1;
+			uint32_t mode:3;
+			uint32_t multicl_len:2;
+			uint32_t rsvd1:1;
+			uint32_t delay_en:1;
+			uint32_t rdsel:2;
+			uint32_t rsvd2:1;
+			uint32_t chsel:3;
+			uint32_t rsvd3:1;
+			uint32_t wrpush_i:1;
+			uint32_t wr_chsel:3;
+			uint32_t rsvd4:3;
+			uint32_t test_cfg:5;
+			uint32_t interrupt_on_error:1;
+			uint32_t interrupt_testmode:1;
+			uint32_t wrfence_chsel:2;
+		};
+	};
+};
+
+/*
+ * 64-bit status value, usable as the raw word (csr) or as two 32-bit
+ * counters. Presumably the image of CSR_STATUS0 - confirm.
+ */
+struct nlb_status0 {
+	union {
+		uint64_t csr;
+		struct {
+			uint32_t num_writes;
+			uint32_t num_reads;
+		};
+	};
+};
+
+/*
+ * 64-bit status value, usable as the raw word (csr) or as two 32-bit
+ * pending-request counters. Presumably the image of CSR_STATUS1 - confirm.
+ */
+struct nlb_status1 {
+	union {
+		uint64_t csr;
+		struct {
+			uint32_t num_pend_writes;
+			uint32_t num_pend_reads;
+		};
+	};
+};
+
+/*
+ * Test result record located DSM_STATUS bytes into the DSM buffer
+ * (nlb_afu_ctx_init() points status_ptr at it).
+ */
+struct nlb_dsm_status {
+	uint32_t test_complete;
+	uint32_t test_error;
+	uint64_t num_clocks;
+	uint32_t num_reads;
+	uint32_t num_writes;
+	uint32_t start_overhead;
+	uint32_t end_overhead;
+};
+
+/*
+ * DMA registers definition
+ * The four offsets below are added to the DMA base address by
+ * dma_afu_ctx_init() to obtain the CSR, descriptor and ASE addresses.
+ */
+#define DMA_CSR       0x40
+#define DMA_DESC      0x60
+#define DMA_ASE_CTRL  0x200
+#define DMA_ASE_DATA  0x1000
+
+/* ASE window size in bytes and the mask of the offset inside one window */
+#define DMA_ASE_WINDOW       4096
+#define DMA_ASE_WINDOW_MASK  ((uint64_t)(DMA_ASE_WINDOW - 1))
+/* initial value of cur_ase_page, i.e. no page selected yet */
+#define INVALID_ASE_PAGE     0xffffffffffffffffULL
+
+/* the HOST_ADDR macros tag an address by OR-ing in bits 49 and/or 48 */
+#define DMA_WF_MAGIC             0x5772745F53796E63ULL
+#define DMA_WF_MAGIC_ROM         0x1000000000000
+#define DMA_HOST_ADDR(addr)      ((addr) | 0x2000000000000)
+#define DMA_WF_HOST_ADDR(addr)   ((addr) | 0x3000000000000)
+
+/* number of entries in dma_buf[] / dma_iova[] of struct dma_afu_ctx */
+#define NUM_DMA_BUF   8
+#define HALF_DMA_BUF  (NUM_DMA_BUF / 2)
+
+#define DMA_MASK_32_BIT 0xFFFFFFFF
+
+/* values equal bits 0..2 of msgdma_status (busy, desc_buf_empty/full) */
+#define DMA_CSR_BUSY           0x1
+#define DMA_DESC_BUFFER_EMPTY  0x2
+#define DMA_DESC_BUFFER_FULL   0x4
+
+/* alignment predicates; addr is used as an integer operand of % */
+#define DWORD_BYTES 4
+#define IS_ALIGNED_DWORD(addr) (((addr) % DWORD_BYTES) == 0)
+
+#define QWORD_BYTES 8
+#define IS_ALIGNED_QWORD(addr) (((addr) % QWORD_BYTES) == 0)
+
+/* also the alignment requested for the descriptor buffer */
+#define DMA_ALIGN_BYTES 64
+#define IS_DMA_ALIGNED(addr) (((addr) % DMA_ALIGN_BYTES) == 0)
+
+/* four times DMA_ALIGN_BYTES, i.e. 256 */
+#define CCIP_ALIGN_BYTES (DMA_ALIGN_BYTES << 2)
+
+#define DMA_TIMEOUT_MSEC  5000
+
+/* size in bytes of the magic buffer allocated by dma_afu_ctx_init() */
+#define MAGIC_BUF_SIZE  64
+#define ERR_CHECK_LIMIT  64
+
+/* smaller of two values; each argument may be evaluated twice */
+#ifndef MIN
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif
+
+/* element count of a true array (not valid for a pointer) */
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+/* DMA transfer direction; FPGA_MAX_TRANSFER_TYPE is the number of kinds */
+typedef enum {
+	HOST_TO_FPGA = 0,
+	FPGA_TO_HOST,
+	FPGA_TO_FPGA,
+	FPGA_MAX_TRANSFER_TYPE,
+} fpga_dma_type;
+
+/*
+ * Control word of a DMA descriptor (the "control" member of
+ * msgdma_ext_desc), usable as the raw 32-bit value or through bitfields.
+ */
+typedef union {
+	uint32_t csr;
+	struct {
+		uint32_t tx_channel:8;
+		uint32_t generate_sop:1;
+		uint32_t generate_eop:1;
+		uint32_t park_reads:1;
+		uint32_t park_writes:1;
+		uint32_t end_on_eop:1;
+		uint32_t reserved_1:1;
+		uint32_t transfer_irq_en:1;
+		uint32_t early_term_irq_en:1;
+		uint32_t trans_error_irq_en:8;
+		uint32_t early_done_en:1;
+		uint32_t reserved_2:6;
+		uint32_t go:1;
+	};
+} msgdma_desc_ctrl;
+
+/*
+ * Packed DMA descriptor (32 bytes). Read and write addresses are split into
+ * a 32-bit base field and a 32-bit *_ext field; presumably these are the low
+ * and high halves of a 64-bit address - confirm against the code that fills
+ * the descriptor.
+ */
+typedef struct __rte_packed {
+	uint32_t rd_address;
+	uint32_t wr_address;
+	uint32_t len;
+	uint16_t seq_num;
+	uint8_t rd_burst_count;
+	uint8_t wr_burst_count;
+	uint16_t rd_stride;
+	uint16_t wr_stride;
+	uint32_t rd_address_ext;
+	uint32_t wr_address_ext;
+	msgdma_desc_ctrl control;
+} msgdma_ext_desc;
+
+/*
+ * DMA status word (first member of msgdma_csr), usable as the raw 32-bit
+ * value or through bitfields; the field widths add up to 32 bits.
+ */
+typedef union {
+	uint32_t csr;
+	struct {
+		uint32_t busy:1;
+		uint32_t desc_buf_empty:1;
+		uint32_t desc_buf_full:1;
+		uint32_t rsp_buf_empty:1;
+		uint32_t rsp_buf_full:1;
+		uint32_t stopped:1;
+		uint32_t resetting:1;
+		uint32_t stopped_on_error:1;
+		uint32_t stopped_on_early_term:1;
+		uint32_t irq:1;
+		uint32_t reserved:22;
+	};
+} msgdma_status;
+
+/*
+ * DMA control word (second member of msgdma_csr), usable as the raw 32-bit
+ * value or through bitfields. Six control bits are followed by 26 reserved
+ * bits so that the bitfield view covers the whole 32-bit word.
+ */
+typedef union {
+	uint32_t csr;
+	struct {
+		uint32_t stop_dispatcher:1;
+		uint32_t reset_dispatcher:1;
+		uint32_t stop_on_error:1;
+		uint32_t stopped_on_early_term:1;
+		uint32_t global_intr_en_mask:1;
+		uint32_t stop_descriptors:1;
+		uint32_t reserved:26;
+	};
+} msgdma_ctrl;
+
+/* Fill level word of msgdma_csr: two 16-bit levels sharing one 32-bit value */
+typedef union {
+	uint32_t csr;
+	struct {
+		uint32_t rd_fill_level:16;
+		uint32_t wr_fill_level:16;
+	};
+} msgdma_fill_level;
+
+/* Response level word of msgdma_csr: a 16-bit level plus 16 reserved bits */
+typedef union {
+	uint32_t csr;
+	struct {
+		uint32_t rsp_fill_level:16;
+		uint32_t reserved:16;
+	};
+} msgdma_rsp_level;
+
+/* Sequence number word of msgdma_csr: two 16-bit numbers in one 32-bit value */
+typedef union {
+	uint32_t csr;
+	struct {
+		uint32_t rd_seq_num:16;
+		uint32_t wr_seq_num:16;
+	};
+} msgdma_seq_num;
+
+/*
+ * Packed block of five consecutive 32-bit DMA words. CSR_STATUS() and
+ * CSR_CONTROL() overlay it on a CSR address to reach the first two members.
+ */
+typedef struct __rte_packed {
+	msgdma_status status;
+	msgdma_ctrl ctrl;
+	msgdma_fill_level fill_level;
+	msgdma_rsp_level rsp;
+	msgdma_seq_num seq_num;
+} msgdma_csr;
+
+/* address of the status / control word inside the msgdma_csr located at csr */
+#define CSR_STATUS(csr)   (&(((msgdma_csr *)(csr))->status))
+#define CSR_CONTROL(csr)  (&(((msgdma_csr *)(csr))->ctrl))
+
+/* Runtime state of the NLB block, filled in by nlb_afu_ctx_init(). */
+struct nlb_afu_ctx {
+	uint8_t *addr;      /* address at which the NLB0 header was found */
+	uint8_t *dsm_ptr;   /* DSM buffer, DSM_SIZE bytes */
+	uint64_t dsm_iova;  /* IOVA of dsm_ptr */
+	uint8_t *src_ptr;   /* source buffer, NLB_BUF_SIZE bytes */
+	uint64_t src_iova;  /* IOVA of src_ptr */
+	uint8_t *dest_ptr;  /* destination buffer, NLB_BUF_SIZE bytes */
+	uint64_t dest_iova; /* IOVA of dest_ptr */
+	struct nlb_dsm_status *status_ptr; /* dsm_ptr + DSM_STATUS */
+};
+
+/*
+ * Runtime state of one DMA block. The members up to event_fd plus desc_buf,
+ * magic_buf and magic_iova are set by dma_afu_ctx_init(); the remaining
+ * members are not touched by the init code and are presumably filled in by
+ * the DMA test - confirm.
+ */
+struct dma_afu_ctx {
+	int index;              /* position in n3000_afu_priv.dma_ctx[] */
+	uint8_t *addr;          /* address at which the DMA header was found */
+	uint8_t *csr_addr;      /* addr + DMA_CSR */
+	uint8_t *desc_addr;     /* addr + DMA_DESC */
+	uint8_t *ase_ctrl_addr; /* addr + DMA_ASE_CTRL */
+	uint8_t *ase_data_addr; /* addr + DMA_ASE_DATA */
+	uint64_t mem_size;      /* upper bound used to validate offset/length */
+	uint64_t cur_ase_page;  /* starts as INVALID_ASE_PAGE */
+	int event_fd;           /* eventfd created while DMA 0 is initialized */
+	int verbose;
+	int pattern;
+	void *data_buf;
+	void *ref_buf;
+	msgdma_ext_desc *desc_buf; /* one zeroed descriptor */
+	uint64_t *magic_buf;    /* MAGIC_BUF_SIZE bytes */
+	uint64_t magic_iova;    /* IOVA of magic_buf */
+	uint32_t dma_buf_size;
+	uint64_t *dma_buf[NUM_DMA_BUF];
+	uint64_t dma_iova[NUM_DMA_BUF];
+};
+
+/* Per-device private data, zero-allocated by n3000_afu_init(). */
+struct n3000_afu_priv {
+	struct rte_pmd_afu_nlb_cfg nlb_cfg; /* last accepted NLB configuration */
+	struct rte_pmd_afu_dma_cfg dma_cfg; /* last accepted DMA configuration */
+	struct nlb_afu_ctx nlb_ctx;
+	struct dma_afu_ctx dma_ctx[NUM_N3000_DMA];
+	int num_dma;   /* number of DMA blocks found so far */
+	int cfg_type;  /* type of the last accepted configuration, 0 if none */
+};
+
+#endif /* _N3000_AFU_H_ */
diff --git a/drivers/raw/afu_mf/rte_pmd_afu.h b/drivers/raw/afu_mf/rte_pmd_afu.h
new file mode 100644
index 0000000..f14a053
--- /dev/null
+++ b/drivers/raw/afu_mf/rte_pmd_afu.h
@@ -0,0 +1,97 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2022 Intel Corporation
+ */
+
+#ifndef __RTE_PMD_AFU_H__
+#define __RTE_PMD_AFU_H__
+
+/**
+ * @file rte_pmd_afu.h
+ *
+ * AFU PMD specific definitions.
+ *
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+/* values of rte_pmd_afu_n3000_cfg.type */
+#define RTE_PMD_AFU_N3000_NLB   1
+#define RTE_PMD_AFU_N3000_DMA   2
+
+/* values of rte_pmd_afu_nlb_cfg.mode; only NLB_MODE_LPBK is accepted */
+#define NLB_MODE_LPBK      0
+#define NLB_MODE_READ      1
+#define NLB_MODE_WRITE     2
+#define NLB_MODE_TRPUT     3
+
+/*
+ * values of read_vc / write_vc (up to NLB_VC_RANDOM) and of wrfence_vc
+ * (up to NLB_VC_VH1)
+ */
+#define NLB_VC_AUTO        0
+#define NLB_VC_VL0         1
+#define NLB_VC_VH0         2
+#define NLB_VC_VH1         3
+#define NLB_VC_RANDOM      4
+
+/* values of rte_pmd_afu_nlb_cfg.cache_policy */
+#define NLB_WRLINE_M       0
+#define NLB_WRLINE_I       1
+#define NLB_WRPUSH_I       2
+
+/* values of rte_pmd_afu_nlb_cfg.cache_hint */
+#define NLB_RDLINE_S       0
+#define NLB_RDLINE_I       1
+#define NLB_RDLINE_MIXED   2
+
+/* inclusive bounds of rte_pmd_afu_nlb_cfg.begin and .end */
+#define MIN_CACHE_LINES   1
+#define MAX_CACHE_LINES   1024
+
+/* presumably the bounds of rte_pmd_afu_dma_cfg.size - TODO confirm */
+#define MIN_DMA_BUF_SIZE  64
+#define MAX_DMA_BUF_SIZE  (1023 * 1024)
+
+/**
+ * NLB AFU configuration data structure.
+ */
+struct rte_pmd_afu_nlb_cfg {
+	uint32_t mode;         /* must be NLB_MODE_LPBK */
+	uint32_t begin;        /* MIN_CACHE_LINES .. MAX_CACHE_LINES */
+	uint32_t end;          /* begin .. MAX_CACHE_LINES */
+	uint32_t multi_cl;     /* 1, 2 or 4 */
+	uint32_t cont;
+	uint32_t timeout;
+	uint32_t cache_policy; /* NLB_WRLINE_M .. NLB_WRPUSH_I */
+	uint32_t cache_hint;   /* NLB_RDLINE_S .. NLB_RDLINE_MIXED */
+	uint32_t read_vc;      /* NLB_VC_AUTO .. NLB_VC_RANDOM */
+	uint32_t write_vc;     /* NLB_VC_AUTO .. NLB_VC_RANDOM */
+	uint32_t wrfence_vc;   /* NLB_VC_AUTO .. NLB_VC_VH1 */
+	uint32_t freq_mhz;
+};
+
+/**
+ * DMA AFU configuration data structure.
+ */
+struct rte_pmd_afu_dma_cfg {
+	/*
+	 * The driver rejects a configuration unless index is below 4, offset
+	 * is below the memory size of that DMA and offset + length is non-zero
+	 * and does not exceed that size. For index 3 a length that is not a
+	 * multiple of 64 is rounded down.
+	 */
+	uint32_t index;     /* index of DMA controller */
+	uint32_t length;    /* total length of data to DMA */
+	uint32_t offset;    /* address offset of target memory */
+	uint32_t size;      /* size of transfer buffer */
+	uint32_t pattern;   /* data pattern to fill in test buffer */
+	uint32_t unaligned; /* use unaligned address or length in sweep test */
+	uint32_t verbose;   /* enable verbose error information in test */
+};
+
+/**
+ * N3000 AFU configuration data structure.
+ */
+struct rte_pmd_afu_n3000_cfg {
+	int type;   /* RTE_PMD_AFU_N3000_NLB or RTE_PMD_AFU_N3000_DMA */
+	/* type selects which member of the union is read by the driver */
+	union {
+		struct rte_pmd_afu_nlb_cfg nlb_cfg;
+		struct rte_pmd_afu_dma_cfg dma_cfg;
+	};
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __RTE_PMD_AFU_H__ */