[v3,09/12] baseband/acc: add FFT support to VRB2 variant

Message ID 20230929163516.3636499-10-nicolas.chautru@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Maxime Coquelin
Headers
Series VRB2 bbdev PMD introduction |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Chautru, Nicolas Sept. 29, 2023, 4:35 p.m. UTC
  Support for the FFT the processing specific to the
VRB2 variant.

Signed-off-by: Nicolas Chautru <nicolas.chautru@intel.com>
---
 drivers/baseband/acc/rte_vrb_pmd.c | 132 ++++++++++++++++++++++++++++-
 1 file changed, 128 insertions(+), 4 deletions(-)
  

Comments

Maxime Coquelin Oct. 3, 2023, 2:36 p.m. UTC | #1
On 9/29/23 18:35, Nicolas Chautru wrote:
> Support for the FFT the processing specific to the
> VRB2 variant.
> 
> Signed-off-by: Nicolas Chautru <nicolas.chautru@intel.com>
> ---
>   drivers/baseband/acc/rte_vrb_pmd.c | 132 ++++++++++++++++++++++++++++-
>   1 file changed, 128 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/baseband/acc/rte_vrb_pmd.c b/drivers/baseband/acc/rte_vrb_pmd.c
> index 93add82947..ce4b90d8e7 100644
> --- a/drivers/baseband/acc/rte_vrb_pmd.c
> +++ b/drivers/baseband/acc/rte_vrb_pmd.c
> @@ -903,6 +903,9 @@ vrb_queue_setup(struct rte_bbdev *dev, uint16_t queue_id,
>   			ACC_FCW_LD_BLEN : (conf->op_type == RTE_BBDEV_OP_FFT ?
>   			ACC_FCW_FFT_BLEN : ACC_FCW_MLDTS_BLEN))));
>   
> +	if ((q->d->device_variant == VRB2_VARIANT) && (conf->op_type == RTE_BBDEV_OP_FFT))
> +		fcw_len = ACC_FCW_FFT_BLEN_3;
> +
>   	for (desc_idx = 0; desc_idx < d->sw_ring_max_depth; desc_idx++) {
>   		desc = q->ring_addr + desc_idx;
>   		desc->req.word0 = ACC_DMA_DESC_TYPE;
> @@ -1323,6 +1326,24 @@ vrb_dev_info_get(struct rte_bbdev *dev, struct rte_bbdev_driver_info *dev_info)
>   			.num_buffers_soft_out = 0,
>   			}
>   		},
> +		{
> +			.type	= RTE_BBDEV_OP_FFT,
> +			.cap.fft = {
> +				.capability_flags =
> +						RTE_BBDEV_FFT_WINDOWING |
> +						RTE_BBDEV_FFT_CS_ADJUSTMENT |
> +						RTE_BBDEV_FFT_DFT_BYPASS |
> +						RTE_BBDEV_FFT_IDFT_BYPASS |
> +						RTE_BBDEV_FFT_FP16_INPUT |
> +						RTE_BBDEV_FFT_FP16_OUTPUT |
> +						RTE_BBDEV_FFT_POWER_MEAS |
> +						RTE_BBDEV_FFT_WINDOWING_BYPASS,
> +				.num_buffers_src =
> +						1,
> +				.num_buffers_dst =
> +						1,
> +			}
> +		},
>   		RTE_BBDEV_END_OF_CAPABILITIES_LIST()
>   	};
>   
> @@ -3849,6 +3870,47 @@ vrb1_fcw_fft_fill(struct rte_bbdev_fft_op *op, struct acc_fcw_fft *fcw)
>   		fcw->bypass = 0;
>   }
>   
> +/* Fill in a frame control word for FFT processing. */
> +static inline void
> +vrb2_fcw_fft_fill(struct rte_bbdev_fft_op *op, struct acc_fcw_fft_3 *fcw)
> +{
> +	fcw->in_frame_size = op->fft.input_sequence_size;
> +	fcw->leading_pad_size = op->fft.input_leading_padding;
> +	fcw->out_frame_size = op->fft.output_sequence_size;
> +	fcw->leading_depad_size = op->fft.output_leading_depadding;
> +	fcw->cs_window_sel = op->fft.window_index[0] +
> +			(op->fft.window_index[1] << 8) +
> +			(op->fft.window_index[2] << 16) +
> +			(op->fft.window_index[3] << 24);
> +	fcw->cs_window_sel2 = op->fft.window_index[4] +
> +			(op->fft.window_index[5] << 8);
> +	fcw->cs_enable_bmap = op->fft.cs_bitmap;
> +	fcw->num_antennas = op->fft.num_antennas_log2;
> +	fcw->idft_size = op->fft.idft_log2;
> +	fcw->dft_size = op->fft.dft_log2;
> +	fcw->cs_offset = op->fft.cs_time_adjustment;
> +	fcw->idft_shift = op->fft.idft_shift;
> +	fcw->dft_shift = op->fft.dft_shift;
> +	fcw->cs_multiplier = op->fft.ncs_reciprocal;
> +	fcw->power_shift = op->fft.power_shift; > +	fcw->exp_adj = op->fft.fp16_exp_adjust;
> +	fcw->fp16_in = check_bit(op->fft.op_flags, RTE_BBDEV_FFT_FP16_INPUT);
> +	fcw->fp16_out = check_bit(op->fft.op_flags, RTE_BBDEV_FFT_FP16_OUTPUT);
> +	fcw->power_en = check_bit(op->fft.op_flags, RTE_BBDEV_FFT_POWER_MEAS);
> +	if (check_bit(op->fft.op_flags,
> +			RTE_BBDEV_FFT_IDFT_BYPASS)) {
> +		if (check_bit(op->fft.op_flags,
> +				RTE_BBDEV_FFT_WINDOWING_BYPASS))
> +			fcw->bypass = 2;
> +		else
> +			fcw->bypass = 1;
> +	} else if (check_bit(op->fft.op_flags,
> +			RTE_BBDEV_FFT_DFT_BYPASS))
> +		fcw->bypass = 3;
> +	else
> +		fcw->bypass = 0;

The only difference I see with VRB1 are backed by corresponding op_flags
(POWER & FP16), is that correct? If so, it does not make sense to me to
have a specific fucntion for VRB2.

> +}
> +
>   static inline int
>   vrb1_dma_desc_fft_fill(struct rte_bbdev_fft_op *op,
>   		struct acc_dma_req_desc *desc,
> @@ -3882,6 +3944,58 @@ vrb1_dma_desc_fft_fill(struct rte_bbdev_fft_op *op,
>   	return 0;
>   }
>   
> +static inline int
> +vrb2_dma_desc_fft_fill(struct rte_bbdev_fft_op *op,
> +		struct acc_dma_req_desc *desc,
> +		struct rte_mbuf *input, struct rte_mbuf *output, struct rte_mbuf *win_input,
> +		struct rte_mbuf *pwr, uint32_t *in_offset, uint32_t *out_offset,
> +		uint32_t *win_offset, uint32_t *pwr_offset)
> +{
> +	bool pwr_en = check_bit(op->fft.op_flags, RTE_BBDEV_FFT_POWER_MEAS);
> +	bool win_en = check_bit(op->fft.op_flags, RTE_BBDEV_FFT_DEWINDOWING);
> +	int num_cs = 0, i, bd_idx = 1;
> +
> +	/* FCW already done */
> +	acc_header_init(desc);
> +
> +	RTE_SET_USED(win_input);
> +	RTE_SET_USED(win_offset);
> +
> +	desc->data_ptrs[bd_idx].address = rte_pktmbuf_iova_offset(input, *in_offset);
> +	desc->data_ptrs[bd_idx].blen = op->fft.input_sequence_size * ACC_IQ_SIZE;
> +	desc->data_ptrs[bd_idx].blkid = ACC_DMA_BLKID_IN;
> +	desc->data_ptrs[bd_idx].last = 1;
> +	desc->data_ptrs[bd_idx].dma_ext = 0;
> +	bd_idx++;
> +
> +	desc->data_ptrs[bd_idx].address = rte_pktmbuf_iova_offset(output, *out_offset);
> +	desc->data_ptrs[bd_idx].blen = op->fft.output_sequence_size * ACC_IQ_SIZE;
> +	desc->data_ptrs[bd_idx].blkid = ACC_DMA_BLKID_OUT_HARD;
> +	desc->data_ptrs[bd_idx].last = pwr_en ? 0 : 1;
> +	desc->data_ptrs[bd_idx].dma_ext = 0;
> +	desc->m2dlen = win_en ? 3 : 2;
> +	desc->d2mlen = pwr_en ? 2 : 1;
> +	desc->ib_ant_offset = op->fft.input_sequence_size;
> +	desc->num_ant = op->fft.num_antennas_log2 - 3;
> +
> +	for (i = 0; i < RTE_BBDEV_MAX_CS; i++)
> +		if (check_bit(op->fft.cs_bitmap, 1 << i))
> +			num_cs++;
> +	desc->num_cs = num_cs;
> +
> +	if (pwr_en && pwr) {
> +		bd_idx++;
> +		desc->data_ptrs[bd_idx].address = rte_pktmbuf_iova_offset(pwr, *pwr_offset);
> +		desc->data_ptrs[bd_idx].blen = num_cs * (1 << op->fft.num_antennas_log2) * 4;
> +		desc->data_ptrs[bd_idx].blkid = ACC_DMA_BLKID_OUT_SOFT;
> +		desc->data_ptrs[bd_idx].last = 1;
> +		desc->data_ptrs[bd_idx].dma_ext = 0;
> +	}
> +	desc->ob_cyc_offset = op->fft.output_sequence_size;
> +	desc->ob_ant_offset = op->fft.output_sequence_size * num_cs;
> +	desc->op_addr = op;
> +	return 0;
> +}
>   
>   /** Enqueue one FFT operation for device. */
>   static inline int
> @@ -3889,22 +4003,32 @@ vrb_enqueue_fft_one_op(struct acc_queue *q, struct rte_bbdev_fft_op *op,
>   		uint16_t total_enqueued_cbs)
>   {
>   	union acc_dma_desc *desc;
> -	struct rte_mbuf *input, *output;
> -	uint32_t in_offset, out_offset;
> +	struct rte_mbuf *input, *output, *pwr, *win;
> +	uint32_t in_offset, out_offset, pwr_offset, win_offset;
>   	struct acc_fcw_fft *fcw;
>   
>   	desc = acc_desc(q, total_enqueued_cbs);
>   	input = op->fft.base_input.data;
>   	output = op->fft.base_output.data;
> +	pwr = op->fft.power_meas_output.data;
> +	win = op->fft.dewindowing_input.data;
>   	in_offset = op->fft.base_input.offset;
>   	out_offset = op->fft.base_output.offset;
> +	pwr_offset = op->fft.power_meas_output.offset;
> +	win_offset = op->fft.dewindowing_input.offset;
>   
>   	fcw = (struct acc_fcw_fft *) (q->fcw_ring +
>   			((q->sw_ring_head + total_enqueued_cbs) & q->sw_ring_wrap_mask)
>   			* ACC_MAX_FCW_SIZE);
>   
> -	vrb1_fcw_fft_fill(op, fcw);
> -	vrb1_dma_desc_fft_fill(op, &desc->req, input, output, &in_offset, &out_offset);
> +	if (q->d->device_variant == VRB1_VARIANT) {
> +		vrb1_fcw_fft_fill(op, fcw);
> +		vrb1_dma_desc_fft_fill(op, &desc->req, input, output, &in_offset, &out_offset);
> +	} else {
> +		vrb2_fcw_fft_fill(op, (struct acc_fcw_fft_3 *) fcw);
> +		vrb2_dma_desc_fft_fill(op, &desc->req, input, output, win, pwr,
> +				&in_offset, &out_offset, &win_offset, &pwr_offset);
> +	}
>   #ifdef RTE_LIBRTE_BBDEV_DEBUG
>   	rte_memdump(stderr, "FCW", &desc->req.fcw_fft,
>   			sizeof(desc->req.fcw_fft));
  
Chautru, Nicolas Oct. 3, 2023, 6:20 p.m. UTC | #2
Hi Maxime, 

> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Tuesday, October 3, 2023 7:37 AM
> To: Chautru, Nicolas <nicolas.chautru@intel.com>; dev@dpdk.org
> Cc: hemant.agrawal@nxp.com; david.marchand@redhat.com; Vargas, Hernan
> <hernan.vargas@intel.com>
> Subject: Re: [PATCH v3 09/12] baseband/acc: add FFT support to VRB2 variant
> 
> 
> 
> On 9/29/23 18:35, Nicolas Chautru wrote:
> > Support for the FFT the processing specific to the
> > VRB2 variant.
> >
> > Signed-off-by: Nicolas Chautru <nicolas.chautru@intel.com>
> > ---
> >   drivers/baseband/acc/rte_vrb_pmd.c | 132
> ++++++++++++++++++++++++++++-
> >   1 file changed, 128 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/baseband/acc/rte_vrb_pmd.c
> > b/drivers/baseband/acc/rte_vrb_pmd.c
> > index 93add82947..ce4b90d8e7 100644
> > --- a/drivers/baseband/acc/rte_vrb_pmd.c
> > +++ b/drivers/baseband/acc/rte_vrb_pmd.c
> > @@ -903,6 +903,9 @@ vrb_queue_setup(struct rte_bbdev *dev, uint16_t
> queue_id,
> >   			ACC_FCW_LD_BLEN : (conf->op_type ==
> RTE_BBDEV_OP_FFT ?
> >   			ACC_FCW_FFT_BLEN : ACC_FCW_MLDTS_BLEN))));
> >
> > +	if ((q->d->device_variant == VRB2_VARIANT) && (conf->op_type ==
> RTE_BBDEV_OP_FFT))
> > +		fcw_len = ACC_FCW_FFT_BLEN_3;
> > +
> >   	for (desc_idx = 0; desc_idx < d->sw_ring_max_depth; desc_idx++) {
> >   		desc = q->ring_addr + desc_idx;
> >   		desc->req.word0 = ACC_DMA_DESC_TYPE; @@ -1323,6
> +1326,24 @@
> > vrb_dev_info_get(struct rte_bbdev *dev, struct rte_bbdev_driver_info
> *dev_info)
> >   			.num_buffers_soft_out = 0,
> >   			}
> >   		},
> > +		{
> > +			.type	= RTE_BBDEV_OP_FFT,
> > +			.cap.fft = {
> > +				.capability_flags =
> > +
> 	RTE_BBDEV_FFT_WINDOWING |
> > +
> 	RTE_BBDEV_FFT_CS_ADJUSTMENT |
> > +
> 	RTE_BBDEV_FFT_DFT_BYPASS |
> > +
> 	RTE_BBDEV_FFT_IDFT_BYPASS |
> > +						RTE_BBDEV_FFT_FP16_INPUT
> |
> > +
> 	RTE_BBDEV_FFT_FP16_OUTPUT |
> > +
> 	RTE_BBDEV_FFT_POWER_MEAS |
> > +
> 	RTE_BBDEV_FFT_WINDOWING_BYPASS,
> > +				.num_buffers_src =
> > +						1,
> > +				.num_buffers_dst =
> > +						1,
> > +			}
> > +		},
> >   		RTE_BBDEV_END_OF_CAPABILITIES_LIST()
> >   	};
> >
> > @@ -3849,6 +3870,47 @@ vrb1_fcw_fft_fill(struct rte_bbdev_fft_op *op,
> struct acc_fcw_fft *fcw)
> >   		fcw->bypass = 0;
> >   }
> >
> > +/* Fill in a frame control word for FFT processing. */ static inline
> > +void vrb2_fcw_fft_fill(struct rte_bbdev_fft_op *op, struct
> > +acc_fcw_fft_3 *fcw) {
> > +	fcw->in_frame_size = op->fft.input_sequence_size;
> > +	fcw->leading_pad_size = op->fft.input_leading_padding;
> > +	fcw->out_frame_size = op->fft.output_sequence_size;
> > +	fcw->leading_depad_size = op->fft.output_leading_depadding;
> > +	fcw->cs_window_sel = op->fft.window_index[0] +
> > +			(op->fft.window_index[1] << 8) +
> > +			(op->fft.window_index[2] << 16) +
> > +			(op->fft.window_index[3] << 24);
> > +	fcw->cs_window_sel2 = op->fft.window_index[4] +
> > +			(op->fft.window_index[5] << 8);
> > +	fcw->cs_enable_bmap = op->fft.cs_bitmap;
> > +	fcw->num_antennas = op->fft.num_antennas_log2;
> > +	fcw->idft_size = op->fft.idft_log2;
> > +	fcw->dft_size = op->fft.dft_log2;
> > +	fcw->cs_offset = op->fft.cs_time_adjustment;
> > +	fcw->idft_shift = op->fft.idft_shift;
> > +	fcw->dft_shift = op->fft.dft_shift;
> > +	fcw->cs_multiplier = op->fft.ncs_reciprocal;
> > +	fcw->power_shift = op->fft.power_shift; > +	fcw->exp_adj = op-
> >fft.fp16_exp_adjust;
> > +	fcw->fp16_in = check_bit(op->fft.op_flags,
> RTE_BBDEV_FFT_FP16_INPUT);
> > +	fcw->fp16_out = check_bit(op->fft.op_flags,
> RTE_BBDEV_FFT_FP16_OUTPUT);
> > +	fcw->power_en = check_bit(op->fft.op_flags,
> RTE_BBDEV_FFT_POWER_MEAS);
> > +	if (check_bit(op->fft.op_flags,
> > +			RTE_BBDEV_FFT_IDFT_BYPASS)) {
> > +		if (check_bit(op->fft.op_flags,
> > +				RTE_BBDEV_FFT_WINDOWING_BYPASS))
> > +			fcw->bypass = 2;
> > +		else
> > +			fcw->bypass = 1;
> > +	} else if (check_bit(op->fft.op_flags,
> > +			RTE_BBDEV_FFT_DFT_BYPASS))
> > +		fcw->bypass = 3;
> > +	else
> > +		fcw->bypass = 0;
> 
> The only difference I see with VRB1 are backed by corresponding op_flags
> (POWER & FP16), is that correct? If so, it does not make sense to me to have a
> specific function for VRB2.

There are more changes but these are only formally enabled in the next stepping hence some of the
related code is not included yet. More generally the FCW and IP is different from VRB1 implementation. 

> 
> > +}
> > +
> >   static inline int
> >   vrb1_dma_desc_fft_fill(struct rte_bbdev_fft_op *op,
> >   		struct acc_dma_req_desc *desc,
> > @@ -3882,6 +3944,58 @@ vrb1_dma_desc_fft_fill(struct rte_bbdev_fft_op
> *op,
> >   	return 0;
> >   }
> >
> > +static inline int
> > +vrb2_dma_desc_fft_fill(struct rte_bbdev_fft_op *op,
> > +		struct acc_dma_req_desc *desc,
> > +		struct rte_mbuf *input, struct rte_mbuf *output, struct
> rte_mbuf *win_input,
> > +		struct rte_mbuf *pwr, uint32_t *in_offset, uint32_t
> *out_offset,
> > +		uint32_t *win_offset, uint32_t *pwr_offset) {
> > +	bool pwr_en = check_bit(op->fft.op_flags,
> RTE_BBDEV_FFT_POWER_MEAS);
> > +	bool win_en = check_bit(op->fft.op_flags,
> RTE_BBDEV_FFT_DEWINDOWING);
> > +	int num_cs = 0, i, bd_idx = 1;
> > +
> > +	/* FCW already done */
> > +	acc_header_init(desc);
> > +
> > +	RTE_SET_USED(win_input);
> > +	RTE_SET_USED(win_offset);
> > +
> > +	desc->data_ptrs[bd_idx].address = rte_pktmbuf_iova_offset(input,
> *in_offset);
> > +	desc->data_ptrs[bd_idx].blen = op->fft.input_sequence_size *
> ACC_IQ_SIZE;
> > +	desc->data_ptrs[bd_idx].blkid = ACC_DMA_BLKID_IN;
> > +	desc->data_ptrs[bd_idx].last = 1;
> > +	desc->data_ptrs[bd_idx].dma_ext = 0;
> > +	bd_idx++;
> > +
> > +	desc->data_ptrs[bd_idx].address = rte_pktmbuf_iova_offset(output,
> *out_offset);
> > +	desc->data_ptrs[bd_idx].blen = op->fft.output_sequence_size *
> ACC_IQ_SIZE;
> > +	desc->data_ptrs[bd_idx].blkid = ACC_DMA_BLKID_OUT_HARD;
> > +	desc->data_ptrs[bd_idx].last = pwr_en ? 0 : 1;
> > +	desc->data_ptrs[bd_idx].dma_ext = 0;
> > +	desc->m2dlen = win_en ? 3 : 2;
> > +	desc->d2mlen = pwr_en ? 2 : 1;
> > +	desc->ib_ant_offset = op->fft.input_sequence_size;
> > +	desc->num_ant = op->fft.num_antennas_log2 - 3;
> > +
> > +	for (i = 0; i < RTE_BBDEV_MAX_CS; i++)
> > +		if (check_bit(op->fft.cs_bitmap, 1 << i))
> > +			num_cs++;
> > +	desc->num_cs = num_cs;
> > +
> > +	if (pwr_en && pwr) {
> > +		bd_idx++;
> > +		desc->data_ptrs[bd_idx].address =
> rte_pktmbuf_iova_offset(pwr, *pwr_offset);
> > +		desc->data_ptrs[bd_idx].blen = num_cs * (1 << op-
> >fft.num_antennas_log2) * 4;
> > +		desc->data_ptrs[bd_idx].blkid = ACC_DMA_BLKID_OUT_SOFT;
> > +		desc->data_ptrs[bd_idx].last = 1;
> > +		desc->data_ptrs[bd_idx].dma_ext = 0;
> > +	}
> > +	desc->ob_cyc_offset = op->fft.output_sequence_size;
> > +	desc->ob_ant_offset = op->fft.output_sequence_size * num_cs;
> > +	desc->op_addr = op;
> > +	return 0;
> > +}
> >
> >   /** Enqueue one FFT operation for device. */
> >   static inline int
> > @@ -3889,22 +4003,32 @@ vrb_enqueue_fft_one_op(struct acc_queue *q,
> struct rte_bbdev_fft_op *op,
> >   		uint16_t total_enqueued_cbs)
> >   {
> >   	union acc_dma_desc *desc;
> > -	struct rte_mbuf *input, *output;
> > -	uint32_t in_offset, out_offset;
> > +	struct rte_mbuf *input, *output, *pwr, *win;
> > +	uint32_t in_offset, out_offset, pwr_offset, win_offset;
> >   	struct acc_fcw_fft *fcw;
> >
> >   	desc = acc_desc(q, total_enqueued_cbs);
> >   	input = op->fft.base_input.data;
> >   	output = op->fft.base_output.data;
> > +	pwr = op->fft.power_meas_output.data;
> > +	win = op->fft.dewindowing_input.data;
> >   	in_offset = op->fft.base_input.offset;
> >   	out_offset = op->fft.base_output.offset;
> > +	pwr_offset = op->fft.power_meas_output.offset;
> > +	win_offset = op->fft.dewindowing_input.offset;
> >
> >   	fcw = (struct acc_fcw_fft *) (q->fcw_ring +
> >   			((q->sw_ring_head + total_enqueued_cbs) & q-
> >sw_ring_wrap_mask)
> >   			* ACC_MAX_FCW_SIZE);
> >
> > -	vrb1_fcw_fft_fill(op, fcw);
> > -	vrb1_dma_desc_fft_fill(op, &desc->req, input, output, &in_offset,
> &out_offset);
> > +	if (q->d->device_variant == VRB1_VARIANT) {
> > +		vrb1_fcw_fft_fill(op, fcw);
> > +		vrb1_dma_desc_fft_fill(op, &desc->req, input, output,
> &in_offset, &out_offset);
> > +	} else {
> > +		vrb2_fcw_fft_fill(op, (struct acc_fcw_fft_3 *) fcw);
> > +		vrb2_dma_desc_fft_fill(op, &desc->req, input, output, win,
> pwr,
> > +				&in_offset, &out_offset, &win_offset,
> &pwr_offset);
> > +	}
> >   #ifdef RTE_LIBRTE_BBDEV_DEBUG
> >   	rte_memdump(stderr, "FCW", &desc->req.fcw_fft,
> >   			sizeof(desc->req.fcw_fft));
  
Maxime Coquelin Oct. 4, 2023, 7:11 a.m. UTC | #3
On 10/3/23 20:20, Chautru, Nicolas wrote:
> Hi Maxime,
> 
>> -----Original Message-----
>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>> Sent: Tuesday, October 3, 2023 7:37 AM
>> To: Chautru, Nicolas <nicolas.chautru@intel.com>; dev@dpdk.org
>> Cc: hemant.agrawal@nxp.com; david.marchand@redhat.com; Vargas, Hernan
>> <hernan.vargas@intel.com>
>> Subject: Re: [PATCH v3 09/12] baseband/acc: add FFT support to VRB2 variant
>>
>>
>>
>> On 9/29/23 18:35, Nicolas Chautru wrote:
>>> Support for the FFT the processing specific to the
>>> VRB2 variant.
>>>
>>> Signed-off-by: Nicolas Chautru <nicolas.chautru@intel.com>
>>> ---
>>>    drivers/baseband/acc/rte_vrb_pmd.c | 132
>> ++++++++++++++++++++++++++++-
>>>    1 file changed, 128 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/baseband/acc/rte_vrb_pmd.c
>>> b/drivers/baseband/acc/rte_vrb_pmd.c
>>> index 93add82947..ce4b90d8e7 100644
>>> --- a/drivers/baseband/acc/rte_vrb_pmd.c
>>> +++ b/drivers/baseband/acc/rte_vrb_pmd.c
>>> @@ -903,6 +903,9 @@ vrb_queue_setup(struct rte_bbdev *dev, uint16_t
>> queue_id,
>>>    			ACC_FCW_LD_BLEN : (conf->op_type ==
>> RTE_BBDEV_OP_FFT ?
>>>    			ACC_FCW_FFT_BLEN : ACC_FCW_MLDTS_BLEN))));
>>>
>>> +	if ((q->d->device_variant == VRB2_VARIANT) && (conf->op_type ==
>> RTE_BBDEV_OP_FFT))
>>> +		fcw_len = ACC_FCW_FFT_BLEN_3;
>>> +
>>>    	for (desc_idx = 0; desc_idx < d->sw_ring_max_depth; desc_idx++) {
>>>    		desc = q->ring_addr + desc_idx;
>>>    		desc->req.word0 = ACC_DMA_DESC_TYPE; @@ -1323,6
>> +1326,24 @@
>>> vrb_dev_info_get(struct rte_bbdev *dev, struct rte_bbdev_driver_info
>> *dev_info)
>>>    			.num_buffers_soft_out = 0,
>>>    			}
>>>    		},
>>> +		{
>>> +			.type	= RTE_BBDEV_OP_FFT,
>>> +			.cap.fft = {
>>> +				.capability_flags =
>>> +
>> 	RTE_BBDEV_FFT_WINDOWING |
>>> +
>> 	RTE_BBDEV_FFT_CS_ADJUSTMENT |
>>> +
>> 	RTE_BBDEV_FFT_DFT_BYPASS |
>>> +
>> 	RTE_BBDEV_FFT_IDFT_BYPASS |
>>> +						RTE_BBDEV_FFT_FP16_INPUT
>> |
>>> +
>> 	RTE_BBDEV_FFT_FP16_OUTPUT |
>>> +
>> 	RTE_BBDEV_FFT_POWER_MEAS |
>>> +
>> 	RTE_BBDEV_FFT_WINDOWING_BYPASS,
>>> +				.num_buffers_src =
>>> +						1,
>>> +				.num_buffers_dst =
>>> +						1,
>>> +			}
>>> +		},
>>>    		RTE_BBDEV_END_OF_CAPABILITIES_LIST()
>>>    	};
>>>
>>> @@ -3849,6 +3870,47 @@ vrb1_fcw_fft_fill(struct rte_bbdev_fft_op *op,
>> struct acc_fcw_fft *fcw)
>>>    		fcw->bypass = 0;
>>>    }
>>>
>>> +/* Fill in a frame control word for FFT processing. */ static inline
>>> +void vrb2_fcw_fft_fill(struct rte_bbdev_fft_op *op, struct
>>> +acc_fcw_fft_3 *fcw) {
>>> +	fcw->in_frame_size = op->fft.input_sequence_size;
>>> +	fcw->leading_pad_size = op->fft.input_leading_padding;
>>> +	fcw->out_frame_size = op->fft.output_sequence_size;
>>> +	fcw->leading_depad_size = op->fft.output_leading_depadding;
>>> +	fcw->cs_window_sel = op->fft.window_index[0] +
>>> +			(op->fft.window_index[1] << 8) +
>>> +			(op->fft.window_index[2] << 16) +
>>> +			(op->fft.window_index[3] << 24);
>>> +	fcw->cs_window_sel2 = op->fft.window_index[4] +
>>> +			(op->fft.window_index[5] << 8);
>>> +	fcw->cs_enable_bmap = op->fft.cs_bitmap;
>>> +	fcw->num_antennas = op->fft.num_antennas_log2;
>>> +	fcw->idft_size = op->fft.idft_log2;
>>> +	fcw->dft_size = op->fft.dft_log2;
>>> +	fcw->cs_offset = op->fft.cs_time_adjustment;
>>> +	fcw->idft_shift = op->fft.idft_shift;
>>> +	fcw->dft_shift = op->fft.dft_shift;
>>> +	fcw->cs_multiplier = op->fft.ncs_reciprocal;
>>> +	fcw->power_shift = op->fft.power_shift; > +	fcw->exp_adj = op-
>>> fft.fp16_exp_adjust;
>>> +	fcw->fp16_in = check_bit(op->fft.op_flags,
>> RTE_BBDEV_FFT_FP16_INPUT);
>>> +	fcw->fp16_out = check_bit(op->fft.op_flags,
>> RTE_BBDEV_FFT_FP16_OUTPUT);
>>> +	fcw->power_en = check_bit(op->fft.op_flags,
>> RTE_BBDEV_FFT_POWER_MEAS);
>>> +	if (check_bit(op->fft.op_flags,
>>> +			RTE_BBDEV_FFT_IDFT_BYPASS)) {
>>> +		if (check_bit(op->fft.op_flags,
>>> +				RTE_BBDEV_FFT_WINDOWING_BYPASS))
>>> +			fcw->bypass = 2;
>>> +		else
>>> +			fcw->bypass = 1;
>>> +	} else if (check_bit(op->fft.op_flags,
>>> +			RTE_BBDEV_FFT_DFT_BYPASS))
>>> +		fcw->bypass = 3;
>>> +	else
>>> +		fcw->bypass = 0;
>>
>> The only difference I see with VRB1 are backed by corresponding op_flags
>> (POWER & FP16), is that correct? If so, it does not make sense to me to have a
>> specific function for VRB2.
> 
> There are more changes but these are only formally enabled in the next stepping hence some of the
> related code is not included yet. More generally the FCW and IP is different from VRB1 implementation.

Currently, the code is almost identical so vrb1 implementation should be
reused. If some later changes makes the two implementations diverge,
then we can consider having a dedicated function for VRB2 at that time.

>>
>>> +}
>>> +
>>>    static inline int
>>>    vrb1_dma_desc_fft_fill(struct rte_bbdev_fft_op *op,
>>>    		struct acc_dma_req_desc *desc,
>>> @@ -3882,6 +3944,58 @@ vrb1_dma_desc_fft_fill(struct rte_bbdev_fft_op
>> *op,
>>>    	return 0;
>>>    }
>>>
>>> +static inline int
>>> +vrb2_dma_desc_fft_fill(struct rte_bbdev_fft_op *op,
>>> +		struct acc_dma_req_desc *desc,
>>> +		struct rte_mbuf *input, struct rte_mbuf *output, struct
>> rte_mbuf *win_input,
>>> +		struct rte_mbuf *pwr, uint32_t *in_offset, uint32_t
>> *out_offset,
>>> +		uint32_t *win_offset, uint32_t *pwr_offset) {
>>> +	bool pwr_en = check_bit(op->fft.op_flags,
>> RTE_BBDEV_FFT_POWER_MEAS);
>>> +	bool win_en = check_bit(op->fft.op_flags,
>> RTE_BBDEV_FFT_DEWINDOWING);
>>> +	int num_cs = 0, i, bd_idx = 1;
>>> +
>>> +	/* FCW already done */
>>> +	acc_header_init(desc);
>>> +
>>> +	RTE_SET_USED(win_input);
>>> +	RTE_SET_USED(win_offset);
>>> +
>>> +	desc->data_ptrs[bd_idx].address = rte_pktmbuf_iova_offset(input,
>> *in_offset);
>>> +	desc->data_ptrs[bd_idx].blen = op->fft.input_sequence_size *
>> ACC_IQ_SIZE;
>>> +	desc->data_ptrs[bd_idx].blkid = ACC_DMA_BLKID_IN;
>>> +	desc->data_ptrs[bd_idx].last = 1;
>>> +	desc->data_ptrs[bd_idx].dma_ext = 0;
>>> +	bd_idx++;
>>> +
>>> +	desc->data_ptrs[bd_idx].address = rte_pktmbuf_iova_offset(output,
>> *out_offset);
>>> +	desc->data_ptrs[bd_idx].blen = op->fft.output_sequence_size *
>> ACC_IQ_SIZE;
>>> +	desc->data_ptrs[bd_idx].blkid = ACC_DMA_BLKID_OUT_HARD;
>>> +	desc->data_ptrs[bd_idx].last = pwr_en ? 0 : 1;
>>> +	desc->data_ptrs[bd_idx].dma_ext = 0;
>>> +	desc->m2dlen = win_en ? 3 : 2;
>>> +	desc->d2mlen = pwr_en ? 2 : 1;
>>> +	desc->ib_ant_offset = op->fft.input_sequence_size;
>>> +	desc->num_ant = op->fft.num_antennas_log2 - 3;
>>> +
>>> +	for (i = 0; i < RTE_BBDEV_MAX_CS; i++)
>>> +		if (check_bit(op->fft.cs_bitmap, 1 << i))
>>> +			num_cs++;
>>> +	desc->num_cs = num_cs;
>>> +
>>> +	if (pwr_en && pwr) {
>>> +		bd_idx++;
>>> +		desc->data_ptrs[bd_idx].address =
>> rte_pktmbuf_iova_offset(pwr, *pwr_offset);
>>> +		desc->data_ptrs[bd_idx].blen = num_cs * (1 << op-
>>> fft.num_antennas_log2) * 4;
>>> +		desc->data_ptrs[bd_idx].blkid = ACC_DMA_BLKID_OUT_SOFT;
>>> +		desc->data_ptrs[bd_idx].last = 1;
>>> +		desc->data_ptrs[bd_idx].dma_ext = 0;
>>> +	}
>>> +	desc->ob_cyc_offset = op->fft.output_sequence_size;
>>> +	desc->ob_ant_offset = op->fft.output_sequence_size * num_cs;
>>> +	desc->op_addr = op;
>>> +	return 0;
>>> +}
>>>
>>>    /** Enqueue one FFT operation for device. */
>>>    static inline int
>>> @@ -3889,22 +4003,32 @@ vrb_enqueue_fft_one_op(struct acc_queue *q,
>> struct rte_bbdev_fft_op *op,
>>>    		uint16_t total_enqueued_cbs)
>>>    {
>>>    	union acc_dma_desc *desc;
>>> -	struct rte_mbuf *input, *output;
>>> -	uint32_t in_offset, out_offset;
>>> +	struct rte_mbuf *input, *output, *pwr, *win;
>>> +	uint32_t in_offset, out_offset, pwr_offset, win_offset;
>>>    	struct acc_fcw_fft *fcw;
>>>
>>>    	desc = acc_desc(q, total_enqueued_cbs);
>>>    	input = op->fft.base_input.data;
>>>    	output = op->fft.base_output.data;
>>> +	pwr = op->fft.power_meas_output.data;
>>> +	win = op->fft.dewindowing_input.data;
>>>    	in_offset = op->fft.base_input.offset;
>>>    	out_offset = op->fft.base_output.offset;
>>> +	pwr_offset = op->fft.power_meas_output.offset;
>>> +	win_offset = op->fft.dewindowing_input.offset;
>>>
>>>    	fcw = (struct acc_fcw_fft *) (q->fcw_ring +
>>>    			((q->sw_ring_head + total_enqueued_cbs) & q-
>>> sw_ring_wrap_mask)
>>>    			* ACC_MAX_FCW_SIZE);
>>>
>>> -	vrb1_fcw_fft_fill(op, fcw);
>>> -	vrb1_dma_desc_fft_fill(op, &desc->req, input, output, &in_offset,
>> &out_offset);
>>> +	if (q->d->device_variant == VRB1_VARIANT) {
>>> +		vrb1_fcw_fft_fill(op, fcw);
>>> +		vrb1_dma_desc_fft_fill(op, &desc->req, input, output,
>> &in_offset, &out_offset);
>>> +	} else {
>>> +		vrb2_fcw_fft_fill(op, (struct acc_fcw_fft_3 *) fcw);
>>> +		vrb2_dma_desc_fft_fill(op, &desc->req, input, output, win,
>> pwr,
>>> +				&in_offset, &out_offset, &win_offset,
>> &pwr_offset);
>>> +	}
>>>    #ifdef RTE_LIBRTE_BBDEV_DEBUG
>>>    	rte_memdump(stderr, "FCW", &desc->req.fcw_fft,
>>>    			sizeof(desc->req.fcw_fft));
>
  
Chautru, Nicolas Oct. 4, 2023, 9:18 p.m. UTC | #4
Hi Maxime, 

> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Wednesday, October 4, 2023 12:11 AM
> To: Chautru, Nicolas <nicolas.chautru@intel.com>; dev@dpdk.org
> Cc: hemant.agrawal@nxp.com; david.marchand@redhat.com; Vargas, Hernan
> <hernan.vargas@intel.com>
> Subject: Re: [PATCH v3 09/12] baseband/acc: add FFT support to VRB2 variant
> 
> 
> 
> On 10/3/23 20:20, Chautru, Nicolas wrote:
> > Hi Maxime,
> >
> >> -----Original Message-----
> >> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> >> Sent: Tuesday, October 3, 2023 7:37 AM
> >> To: Chautru, Nicolas <nicolas.chautru@intel.com>; dev@dpdk.org
> >> Cc: hemant.agrawal@nxp.com; david.marchand@redhat.com; Vargas,
> Hernan
> >> <hernan.vargas@intel.com>
> >> Subject: Re: [PATCH v3 09/12] baseband/acc: add FFT support to VRB2
> >> variant
> >>
> >>
> >>
> >> On 9/29/23 18:35, Nicolas Chautru wrote:
> >>> Support for the FFT the processing specific to the
> >>> VRB2 variant.
> >>>
> >>> Signed-off-by: Nicolas Chautru <nicolas.chautru@intel.com>
> >>> ---
> >>>    drivers/baseband/acc/rte_vrb_pmd.c | 132
> >> ++++++++++++++++++++++++++++-
> >>>    1 file changed, 128 insertions(+), 4 deletions(-)
> >>>
> >>> diff --git a/drivers/baseband/acc/rte_vrb_pmd.c
> >>> b/drivers/baseband/acc/rte_vrb_pmd.c
> >>> index 93add82947..ce4b90d8e7 100644
> >>> --- a/drivers/baseband/acc/rte_vrb_pmd.c
> >>> +++ b/drivers/baseband/acc/rte_vrb_pmd.c
> >>> @@ -903,6 +903,9 @@ vrb_queue_setup(struct rte_bbdev *dev, uint16_t
> >> queue_id,
> >>>    			ACC_FCW_LD_BLEN : (conf->op_type ==
> >> RTE_BBDEV_OP_FFT ?
> >>>    			ACC_FCW_FFT_BLEN : ACC_FCW_MLDTS_BLEN))));
> >>>
> >>> +	if ((q->d->device_variant == VRB2_VARIANT) && (conf->op_type ==
> >> RTE_BBDEV_OP_FFT))
> >>> +		fcw_len = ACC_FCW_FFT_BLEN_3;
> >>> +
> >>>    	for (desc_idx = 0; desc_idx < d->sw_ring_max_depth; desc_idx++) {
> >>>    		desc = q->ring_addr + desc_idx;
> >>>    		desc->req.word0 = ACC_DMA_DESC_TYPE; @@ -1323,6
> >> +1326,24 @@
> >>> vrb_dev_info_get(struct rte_bbdev *dev, struct rte_bbdev_driver_info
> >> *dev_info)
> >>>    			.num_buffers_soft_out = 0,
> >>>    			}
> >>>    		},
> >>> +		{
> >>> +			.type	= RTE_BBDEV_OP_FFT,
> >>> +			.cap.fft = {
> >>> +				.capability_flags =
> >>> +
> >> 	RTE_BBDEV_FFT_WINDOWING |
> >>> +
> >> 	RTE_BBDEV_FFT_CS_ADJUSTMENT |
> >>> +
> >> 	RTE_BBDEV_FFT_DFT_BYPASS |
> >>> +
> >> 	RTE_BBDEV_FFT_IDFT_BYPASS |
> >>> +						RTE_BBDEV_FFT_FP16_INPUT
> >> |
> >>> +
> >> 	RTE_BBDEV_FFT_FP16_OUTPUT |
> >>> +
> >> 	RTE_BBDEV_FFT_POWER_MEAS |
> >>> +
> >> 	RTE_BBDEV_FFT_WINDOWING_BYPASS,
> >>> +				.num_buffers_src =
> >>> +						1,
> >>> +				.num_buffers_dst =
> >>> +						1,
> >>> +			}
> >>> +		},
> >>>    		RTE_BBDEV_END_OF_CAPABILITIES_LIST()
> >>>    	};
> >>>
> >>> @@ -3849,6 +3870,47 @@ vrb1_fcw_fft_fill(struct rte_bbdev_fft_op
> >>> *op,
> >> struct acc_fcw_fft *fcw)
> >>>    		fcw->bypass = 0;
> >>>    }
> >>>
> >>> +/* Fill in a frame control word for FFT processing. */ static
> >>> +inline void vrb2_fcw_fft_fill(struct rte_bbdev_fft_op *op, struct
> >>> +acc_fcw_fft_3 *fcw) {
> >>> +	fcw->in_frame_size = op->fft.input_sequence_size;
> >>> +	fcw->leading_pad_size = op->fft.input_leading_padding;
> >>> +	fcw->out_frame_size = op->fft.output_sequence_size;
> >>> +	fcw->leading_depad_size = op->fft.output_leading_depadding;
> >>> +	fcw->cs_window_sel = op->fft.window_index[0] +
> >>> +			(op->fft.window_index[1] << 8) +
> >>> +			(op->fft.window_index[2] << 16) +
> >>> +			(op->fft.window_index[3] << 24);
> >>> +	fcw->cs_window_sel2 = op->fft.window_index[4] +
> >>> +			(op->fft.window_index[5] << 8);
> >>> +	fcw->cs_enable_bmap = op->fft.cs_bitmap;
> >>> +	fcw->num_antennas = op->fft.num_antennas_log2;
> >>> +	fcw->idft_size = op->fft.idft_log2;
> >>> +	fcw->dft_size = op->fft.dft_log2;
> >>> +	fcw->cs_offset = op->fft.cs_time_adjustment;
> >>> +	fcw->idft_shift = op->fft.idft_shift;
> >>> +	fcw->dft_shift = op->fft.dft_shift;
> >>> +	fcw->cs_multiplier = op->fft.ncs_reciprocal;
> >>> +	fcw->power_shift = op->fft.power_shift; > +	fcw->exp_adj = op-
> >>> fft.fp16_exp_adjust;
> >>> +	fcw->fp16_in = check_bit(op->fft.op_flags,
> >> RTE_BBDEV_FFT_FP16_INPUT);
> >>> +	fcw->fp16_out = check_bit(op->fft.op_flags,
> >> RTE_BBDEV_FFT_FP16_OUTPUT);
> >>> +	fcw->power_en = check_bit(op->fft.op_flags,
> >> RTE_BBDEV_FFT_POWER_MEAS);
> >>> +	if (check_bit(op->fft.op_flags,
> >>> +			RTE_BBDEV_FFT_IDFT_BYPASS)) {
> >>> +		if (check_bit(op->fft.op_flags,
> >>> +				RTE_BBDEV_FFT_WINDOWING_BYPASS))
> >>> +			fcw->bypass = 2;
> >>> +		else
> >>> +			fcw->bypass = 1;
> >>> +	} else if (check_bit(op->fft.op_flags,
> >>> +			RTE_BBDEV_FFT_DFT_BYPASS))
> >>> +		fcw->bypass = 3;
> >>> +	else
> >>> +		fcw->bypass = 0;
> >>
> >> The only difference I see with VRB1 are backed by corresponding
> >> op_flags (POWER & FP16), is that correct? If so, it does not make
> >> sense to me to have a specific function for VRB2.
> >
> > There are more changes but these are only formally enabled in the next
> > stepping hence some of the related code is not included yet. More generally
> the FCW and IP is different from VRB1 implementation.
> 
> Currently, the code is almost identical so vrb1 implementation should be
> reused. If some later changes makes the two implementations diverge, then we
> can consider having a dedicated function for VRB2 at that time.

If I may, I believe this is best as-is notably for patches and support. 
The functions are fairly small (not much code overlap quantitatively) and the underlying IP is different 
(with more differences we can enable over time). I don’t think it would help anyone really to try to make them
coexist for a small period of time. 
Does that sound fair? 


> 
> >>
> >>> +}
> >>> +
> >>>    static inline int
> >>>    vrb1_dma_desc_fft_fill(struct rte_bbdev_fft_op *op,
> >>>    		struct acc_dma_req_desc *desc,
> >>> @@ -3882,6 +3944,58 @@ vrb1_dma_desc_fft_fill(struct
> >>> rte_bbdev_fft_op
> >> *op,
> >>>    	return 0;
> >>>    }
> >>>
> >>> +static inline int
> >>> +vrb2_dma_desc_fft_fill(struct rte_bbdev_fft_op *op,
> >>> +		struct acc_dma_req_desc *desc,
> >>> +		struct rte_mbuf *input, struct rte_mbuf *output, struct
> >> rte_mbuf *win_input,
> >>> +		struct rte_mbuf *pwr, uint32_t *in_offset, uint32_t
> >> *out_offset,
> >>> +		uint32_t *win_offset, uint32_t *pwr_offset) {
> >>> +	bool pwr_en = check_bit(op->fft.op_flags,
> >> RTE_BBDEV_FFT_POWER_MEAS);
> >>> +	bool win_en = check_bit(op->fft.op_flags,
> >> RTE_BBDEV_FFT_DEWINDOWING);
> >>> +	int num_cs = 0, i, bd_idx = 1;
> >>> +
> >>> +	/* FCW already done */
> >>> +	acc_header_init(desc);
> >>> +
> >>> +	RTE_SET_USED(win_input);
> >>> +	RTE_SET_USED(win_offset);
> >>> +
> >>> +	desc->data_ptrs[bd_idx].address = rte_pktmbuf_iova_offset(input,
> >> *in_offset);
> >>> +	desc->data_ptrs[bd_idx].blen = op->fft.input_sequence_size *
> >> ACC_IQ_SIZE;
> >>> +	desc->data_ptrs[bd_idx].blkid = ACC_DMA_BLKID_IN;
> >>> +	desc->data_ptrs[bd_idx].last = 1;
> >>> +	desc->data_ptrs[bd_idx].dma_ext = 0;
> >>> +	bd_idx++;
> >>> +
> >>> +	desc->data_ptrs[bd_idx].address = rte_pktmbuf_iova_offset(output,
> >> *out_offset);
> >>> +	desc->data_ptrs[bd_idx].blen = op->fft.output_sequence_size *
> >> ACC_IQ_SIZE;
> >>> +	desc->data_ptrs[bd_idx].blkid = ACC_DMA_BLKID_OUT_HARD;
> >>> +	desc->data_ptrs[bd_idx].last = pwr_en ? 0 : 1;
> >>> +	desc->data_ptrs[bd_idx].dma_ext = 0;
> >>> +	desc->m2dlen = win_en ? 3 : 2;
> >>> +	desc->d2mlen = pwr_en ? 2 : 1;
> >>> +	desc->ib_ant_offset = op->fft.input_sequence_size;
> >>> +	desc->num_ant = op->fft.num_antennas_log2 - 3;
> >>> +
> >>> +	for (i = 0; i < RTE_BBDEV_MAX_CS; i++)
> >>> +		if (check_bit(op->fft.cs_bitmap, 1 << i))
> >>> +			num_cs++;
> >>> +	desc->num_cs = num_cs;
> >>> +
> >>> +	if (pwr_en && pwr) {
> >>> +		bd_idx++;
> >>> +		desc->data_ptrs[bd_idx].address =
> >> rte_pktmbuf_iova_offset(pwr, *pwr_offset);
> >>> +		desc->data_ptrs[bd_idx].blen = num_cs * (1 << op-
> >>> fft.num_antennas_log2) * 4;
> >>> +		desc->data_ptrs[bd_idx].blkid = ACC_DMA_BLKID_OUT_SOFT;
> >>> +		desc->data_ptrs[bd_idx].last = 1;
> >>> +		desc->data_ptrs[bd_idx].dma_ext = 0;
> >>> +	}
> >>> +	desc->ob_cyc_offset = op->fft.output_sequence_size;
> >>> +	desc->ob_ant_offset = op->fft.output_sequence_size * num_cs;
> >>> +	desc->op_addr = op;
> >>> +	return 0;
> >>> +}
> >>>
> >>>    /** Enqueue one FFT operation for device. */
> >>>    static inline int
> >>> @@ -3889,22 +4003,32 @@ vrb_enqueue_fft_one_op(struct acc_queue
> *q,
> >> struct rte_bbdev_fft_op *op,
> >>>    		uint16_t total_enqueued_cbs)
> >>>    {
> >>>    	union acc_dma_desc *desc;
> >>> -	struct rte_mbuf *input, *output;
> >>> -	uint32_t in_offset, out_offset;
> >>> +	struct rte_mbuf *input, *output, *pwr, *win;
> >>> +	uint32_t in_offset, out_offset, pwr_offset, win_offset;
> >>>    	struct acc_fcw_fft *fcw;
> >>>
> >>>    	desc = acc_desc(q, total_enqueued_cbs);
> >>>    	input = op->fft.base_input.data;
> >>>    	output = op->fft.base_output.data;
> >>> +	pwr = op->fft.power_meas_output.data;
> >>> +	win = op->fft.dewindowing_input.data;
> >>>    	in_offset = op->fft.base_input.offset;
> >>>    	out_offset = op->fft.base_output.offset;
> >>> +	pwr_offset = op->fft.power_meas_output.offset;
> >>> +	win_offset = op->fft.dewindowing_input.offset;
> >>>
> >>>    	fcw = (struct acc_fcw_fft *) (q->fcw_ring +
> >>>    			((q->sw_ring_head + total_enqueued_cbs) & q-
> >>> sw_ring_wrap_mask)
> >>>    			* ACC_MAX_FCW_SIZE);
> >>>
> >>> -	vrb1_fcw_fft_fill(op, fcw);
> >>> -	vrb1_dma_desc_fft_fill(op, &desc->req, input, output, &in_offset,
> >> &out_offset);
> >>> +	if (q->d->device_variant == VRB1_VARIANT) {
> >>> +		vrb1_fcw_fft_fill(op, fcw);
> >>> +		vrb1_dma_desc_fft_fill(op, &desc->req, input, output,
> >> &in_offset, &out_offset);
> >>> +	} else {
> >>> +		vrb2_fcw_fft_fill(op, (struct acc_fcw_fft_3 *) fcw);
> >>> +		vrb2_dma_desc_fft_fill(op, &desc->req, input, output, win,
> >> pwr,
> >>> +				&in_offset, &out_offset, &win_offset,
> >> &pwr_offset);
> >>> +	}
> >>>    #ifdef RTE_LIBRTE_BBDEV_DEBUG
> >>>    	rte_memdump(stderr, "FCW", &desc->req.fcw_fft,
> >>>    			sizeof(desc->req.fcw_fft));
> >
  
Maxime Coquelin Oct. 5, 2023, 2:34 p.m. UTC | #5
On 10/4/23 23:18, Chautru, Nicolas wrote:
> Hi Maxime,
> 
>> -----Original Message-----
>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>> Sent: Wednesday, October 4, 2023 12:11 AM
>> To: Chautru, Nicolas <nicolas.chautru@intel.com>; dev@dpdk.org
>> Cc: hemant.agrawal@nxp.com; david.marchand@redhat.com; Vargas, Hernan
>> <hernan.vargas@intel.com>
>> Subject: Re: [PATCH v3 09/12] baseband/acc: add FFT support to VRB2 variant
>>
>>
>>
>> On 10/3/23 20:20, Chautru, Nicolas wrote:
>>> Hi Maxime,
>>>
>>>> -----Original Message-----
>>>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>>>> Sent: Tuesday, October 3, 2023 7:37 AM
>>>> To: Chautru, Nicolas <nicolas.chautru@intel.com>; dev@dpdk.org
>>>> Cc: hemant.agrawal@nxp.com; david.marchand@redhat.com; Vargas,
>> Hernan
>>>> <hernan.vargas@intel.com>
>>>> Subject: Re: [PATCH v3 09/12] baseband/acc: add FFT support to VRB2
>>>> variant
>>>>
>>>>
>>>>
>>>> On 9/29/23 18:35, Nicolas Chautru wrote:
>>>>> Support for the FFT the processing specific to the
>>>>> VRB2 variant.
>>>>>
>>>>> Signed-off-by: Nicolas Chautru <nicolas.chautru@intel.com>
>>>>> ---
>>>>>     drivers/baseband/acc/rte_vrb_pmd.c | 132
>>>> ++++++++++++++++++++++++++++-
>>>>>     1 file changed, 128 insertions(+), 4 deletions(-)
>>>>>
>>>>> diff --git a/drivers/baseband/acc/rte_vrb_pmd.c
>>>>> b/drivers/baseband/acc/rte_vrb_pmd.c
>>>>> index 93add82947..ce4b90d8e7 100644
>>>>> --- a/drivers/baseband/acc/rte_vrb_pmd.c
>>>>> +++ b/drivers/baseband/acc/rte_vrb_pmd.c
>>>>> @@ -903,6 +903,9 @@ vrb_queue_setup(struct rte_bbdev *dev, uint16_t
>>>> queue_id,
>>>>>     			ACC_FCW_LD_BLEN : (conf->op_type ==
>>>> RTE_BBDEV_OP_FFT ?
>>>>>     			ACC_FCW_FFT_BLEN : ACC_FCW_MLDTS_BLEN))));
>>>>>
>>>>> +	if ((q->d->device_variant == VRB2_VARIANT) && (conf->op_type ==
>>>> RTE_BBDEV_OP_FFT))
>>>>> +		fcw_len = ACC_FCW_FFT_BLEN_3;
>>>>> +
>>>>>     	for (desc_idx = 0; desc_idx < d->sw_ring_max_depth; desc_idx++) {
>>>>>     		desc = q->ring_addr + desc_idx;
>>>>>     		desc->req.word0 = ACC_DMA_DESC_TYPE; @@ -1323,6
>>>> +1326,24 @@
>>>>> vrb_dev_info_get(struct rte_bbdev *dev, struct rte_bbdev_driver_info
>>>> *dev_info)
>>>>>     			.num_buffers_soft_out = 0,
>>>>>     			}
>>>>>     		},
>>>>> +		{
>>>>> +			.type	= RTE_BBDEV_OP_FFT,
>>>>> +			.cap.fft = {
>>>>> +				.capability_flags =
>>>>> +
>>>> 	RTE_BBDEV_FFT_WINDOWING |
>>>>> +
>>>> 	RTE_BBDEV_FFT_CS_ADJUSTMENT |
>>>>> +
>>>> 	RTE_BBDEV_FFT_DFT_BYPASS |
>>>>> +
>>>> 	RTE_BBDEV_FFT_IDFT_BYPASS |
>>>>> +						RTE_BBDEV_FFT_FP16_INPUT
>>>> |
>>>>> +
>>>> 	RTE_BBDEV_FFT_FP16_OUTPUT |
>>>>> +
>>>> 	RTE_BBDEV_FFT_POWER_MEAS |
>>>>> +
>>>> 	RTE_BBDEV_FFT_WINDOWING_BYPASS,
>>>>> +				.num_buffers_src =
>>>>> +						1,
>>>>> +				.num_buffers_dst =
>>>>> +						1,
>>>>> +			}
>>>>> +		},
>>>>>     		RTE_BBDEV_END_OF_CAPABILITIES_LIST()
>>>>>     	};
>>>>>
>>>>> @@ -3849,6 +3870,47 @@ vrb1_fcw_fft_fill(struct rte_bbdev_fft_op
>>>>> *op,
>>>> struct acc_fcw_fft *fcw)
>>>>>     		fcw->bypass = 0;
>>>>>     }
>>>>>
>>>>> +/* Fill in a frame control word for FFT processing. */ static
>>>>> +inline void vrb2_fcw_fft_fill(struct rte_bbdev_fft_op *op, struct
>>>>> +acc_fcw_fft_3 *fcw) {
>>>>> +	fcw->in_frame_size = op->fft.input_sequence_size;
>>>>> +	fcw->leading_pad_size = op->fft.input_leading_padding;
>>>>> +	fcw->out_frame_size = op->fft.output_sequence_size;
>>>>> +	fcw->leading_depad_size = op->fft.output_leading_depadding;
>>>>> +	fcw->cs_window_sel = op->fft.window_index[0] +
>>>>> +			(op->fft.window_index[1] << 8) +
>>>>> +			(op->fft.window_index[2] << 16) +
>>>>> +			(op->fft.window_index[3] << 24);
>>>>> +	fcw->cs_window_sel2 = op->fft.window_index[4] +
>>>>> +			(op->fft.window_index[5] << 8);
>>>>> +	fcw->cs_enable_bmap = op->fft.cs_bitmap;
>>>>> +	fcw->num_antennas = op->fft.num_antennas_log2;
>>>>> +	fcw->idft_size = op->fft.idft_log2;
>>>>> +	fcw->dft_size = op->fft.dft_log2;
>>>>> +	fcw->cs_offset = op->fft.cs_time_adjustment;
>>>>> +	fcw->idft_shift = op->fft.idft_shift;
>>>>> +	fcw->dft_shift = op->fft.dft_shift;
>>>>> +	fcw->cs_multiplier = op->fft.ncs_reciprocal;
>>>>> +	fcw->power_shift = op->fft.power_shift; > +	fcw->exp_adj = op-
>>>>> fft.fp16_exp_adjust;
>>>>> +	fcw->fp16_in = check_bit(op->fft.op_flags,
>>>> RTE_BBDEV_FFT_FP16_INPUT);
>>>>> +	fcw->fp16_out = check_bit(op->fft.op_flags,
>>>> RTE_BBDEV_FFT_FP16_OUTPUT);
>>>>> +	fcw->power_en = check_bit(op->fft.op_flags,
>>>> RTE_BBDEV_FFT_POWER_MEAS);
>>>>> +	if (check_bit(op->fft.op_flags,
>>>>> +			RTE_BBDEV_FFT_IDFT_BYPASS)) {
>>>>> +		if (check_bit(op->fft.op_flags,
>>>>> +				RTE_BBDEV_FFT_WINDOWING_BYPASS))
>>>>> +			fcw->bypass = 2;
>>>>> +		else
>>>>> +			fcw->bypass = 1;
>>>>> +	} else if (check_bit(op->fft.op_flags,
>>>>> +			RTE_BBDEV_FFT_DFT_BYPASS))
>>>>> +		fcw->bypass = 3;
>>>>> +	else
>>>>> +		fcw->bypass = 0;
>>>>
>>>> The only difference I see with VRB1 are backed by corresponding
>>>> op_flags (POWER & FP16), is that correct? If so, it does not make
>>>> sense to me to have a specific function for VRB2.
>>>
>>> There are more changes but these are only formally enabled in the next
>>> stepping hence some of the related code is not included yet. More generally
>> the FCW and IP is different from VRB1 implementation.
>>
>> Currently, the code is almost identical so vrb1 implementation should be
>> reused. If some later changes makes the two implementations diverge, then we
>> can consider having a dedicated function for VRB2 at that time.
> 
> If I may, I believe this is best as-is notably for patches and support.
> The functions are fairly small (not much code overlap quantitatively) and the underlying IP is different
> (with more differences we can enable over time). I don’t think it would help anyone really to try to make them
> coexist for a small period of time.
> Does that sound fair?

I disagree, as I explained the code currently is almost identical, so
just share the code.

You will diverge, if *really* necessary, when it will make more sense to
have two separate functions. For now it is not the case in my opinion.

Thanks,
Maxime

> 
> 
>>
>>>>
>>>>> +}
>>>>> +
>>>>>     static inline int
>>>>>     vrb1_dma_desc_fft_fill(struct rte_bbdev_fft_op *op,
>>>>>     		struct acc_dma_req_desc *desc,
>>>>> @@ -3882,6 +3944,58 @@ vrb1_dma_desc_fft_fill(struct
>>>>> rte_bbdev_fft_op
>>>> *op,
>>>>>     	return 0;
>>>>>     }
>>>>>
>>>>> +static inline int
>>>>> +vrb2_dma_desc_fft_fill(struct rte_bbdev_fft_op *op,
>>>>> +		struct acc_dma_req_desc *desc,
>>>>> +		struct rte_mbuf *input, struct rte_mbuf *output, struct
>>>> rte_mbuf *win_input,
>>>>> +		struct rte_mbuf *pwr, uint32_t *in_offset, uint32_t
>>>> *out_offset,
>>>>> +		uint32_t *win_offset, uint32_t *pwr_offset) {
>>>>> +	bool pwr_en = check_bit(op->fft.op_flags,
>>>> RTE_BBDEV_FFT_POWER_MEAS);
>>>>> +	bool win_en = check_bit(op->fft.op_flags,
>>>> RTE_BBDEV_FFT_DEWINDOWING);
>>>>> +	int num_cs = 0, i, bd_idx = 1;
>>>>> +
>>>>> +	/* FCW already done */
>>>>> +	acc_header_init(desc);
>>>>> +
>>>>> +	RTE_SET_USED(win_input);
>>>>> +	RTE_SET_USED(win_offset);
>>>>> +
>>>>> +	desc->data_ptrs[bd_idx].address = rte_pktmbuf_iova_offset(input,
>>>> *in_offset);
>>>>> +	desc->data_ptrs[bd_idx].blen = op->fft.input_sequence_size *
>>>> ACC_IQ_SIZE;
>>>>> +	desc->data_ptrs[bd_idx].blkid = ACC_DMA_BLKID_IN;
>>>>> +	desc->data_ptrs[bd_idx].last = 1;
>>>>> +	desc->data_ptrs[bd_idx].dma_ext = 0;
>>>>> +	bd_idx++;
>>>>> +
>>>>> +	desc->data_ptrs[bd_idx].address = rte_pktmbuf_iova_offset(output,
>>>> *out_offset);
>>>>> +	desc->data_ptrs[bd_idx].blen = op->fft.output_sequence_size *
>>>> ACC_IQ_SIZE;
>>>>> +	desc->data_ptrs[bd_idx].blkid = ACC_DMA_BLKID_OUT_HARD;
>>>>> +	desc->data_ptrs[bd_idx].last = pwr_en ? 0 : 1;
>>>>> +	desc->data_ptrs[bd_idx].dma_ext = 0;
>>>>> +	desc->m2dlen = win_en ? 3 : 2;
>>>>> +	desc->d2mlen = pwr_en ? 2 : 1;
>>>>> +	desc->ib_ant_offset = op->fft.input_sequence_size;
>>>>> +	desc->num_ant = op->fft.num_antennas_log2 - 3;
>>>>> +
>>>>> +	for (i = 0; i < RTE_BBDEV_MAX_CS; i++)
>>>>> +		if (check_bit(op->fft.cs_bitmap, 1 << i))
>>>>> +			num_cs++;
>>>>> +	desc->num_cs = num_cs;
>>>>> +
>>>>> +	if (pwr_en && pwr) {
>>>>> +		bd_idx++;
>>>>> +		desc->data_ptrs[bd_idx].address =
>>>> rte_pktmbuf_iova_offset(pwr, *pwr_offset);
>>>>> +		desc->data_ptrs[bd_idx].blen = num_cs * (1 << op-
>>>>> fft.num_antennas_log2) * 4;
>>>>> +		desc->data_ptrs[bd_idx].blkid = ACC_DMA_BLKID_OUT_SOFT;
>>>>> +		desc->data_ptrs[bd_idx].last = 1;
>>>>> +		desc->data_ptrs[bd_idx].dma_ext = 0;
>>>>> +	}
>>>>> +	desc->ob_cyc_offset = op->fft.output_sequence_size;
>>>>> +	desc->ob_ant_offset = op->fft.output_sequence_size * num_cs;
>>>>> +	desc->op_addr = op;
>>>>> +	return 0;
>>>>> +}
>>>>>
>>>>>     /** Enqueue one FFT operation for device. */
>>>>>     static inline int
>>>>> @@ -3889,22 +4003,32 @@ vrb_enqueue_fft_one_op(struct acc_queue
>> *q,
>>>> struct rte_bbdev_fft_op *op,
>>>>>     		uint16_t total_enqueued_cbs)
>>>>>     {
>>>>>     	union acc_dma_desc *desc;
>>>>> -	struct rte_mbuf *input, *output;
>>>>> -	uint32_t in_offset, out_offset;
>>>>> +	struct rte_mbuf *input, *output, *pwr, *win;
>>>>> +	uint32_t in_offset, out_offset, pwr_offset, win_offset;
>>>>>     	struct acc_fcw_fft *fcw;
>>>>>
>>>>>     	desc = acc_desc(q, total_enqueued_cbs);
>>>>>     	input = op->fft.base_input.data;
>>>>>     	output = op->fft.base_output.data;
>>>>> +	pwr = op->fft.power_meas_output.data;
>>>>> +	win = op->fft.dewindowing_input.data;
>>>>>     	in_offset = op->fft.base_input.offset;
>>>>>     	out_offset = op->fft.base_output.offset;
>>>>> +	pwr_offset = op->fft.power_meas_output.offset;
>>>>> +	win_offset = op->fft.dewindowing_input.offset;
>>>>>
>>>>>     	fcw = (struct acc_fcw_fft *) (q->fcw_ring +
>>>>>     			((q->sw_ring_head + total_enqueued_cbs) & q-
>>>>> sw_ring_wrap_mask)
>>>>>     			* ACC_MAX_FCW_SIZE);
>>>>>
>>>>> -	vrb1_fcw_fft_fill(op, fcw);
>>>>> -	vrb1_dma_desc_fft_fill(op, &desc->req, input, output, &in_offset,
>>>> &out_offset);
>>>>> +	if (q->d->device_variant == VRB1_VARIANT) {
>>>>> +		vrb1_fcw_fft_fill(op, fcw);
>>>>> +		vrb1_dma_desc_fft_fill(op, &desc->req, input, output,
>>>> &in_offset, &out_offset);
>>>>> +	} else {
>>>>> +		vrb2_fcw_fft_fill(op, (struct acc_fcw_fft_3 *) fcw);
>>>>> +		vrb2_dma_desc_fft_fill(op, &desc->req, input, output, win,
>>>> pwr,
>>>>> +				&in_offset, &out_offset, &win_offset,
>>>> &pwr_offset);
>>>>> +	}
>>>>>     #ifdef RTE_LIBRTE_BBDEV_DEBUG
>>>>>     	rte_memdump(stderr, "FCW", &desc->req.fcw_fft,
>>>>>     			sizeof(desc->req.fcw_fft));
>>>
>
  
Chautru, Nicolas Oct. 5, 2023, 5:59 p.m. UTC | #6
Hi Maxime, 

> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Thursday, October 5, 2023 7:35 AM
> To: Chautru, Nicolas <nicolas.chautru@intel.com>; dev@dpdk.org
> Cc: hemant.agrawal@nxp.com; david.marchand@redhat.com; Vargas, Hernan
> <hernan.vargas@intel.com>
> Subject: Re: [PATCH v3 09/12] baseband/acc: add FFT support to VRB2 variant
> 
> 
> 
> On 10/4/23 23:18, Chautru, Nicolas wrote:
> > Hi Maxime,
> >
> >> -----Original Message-----
> >> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> >> Sent: Wednesday, October 4, 2023 12:11 AM
> >> To: Chautru, Nicolas <nicolas.chautru@intel.com>; dev@dpdk.org
> >> Cc: hemant.agrawal@nxp.com; david.marchand@redhat.com; Vargas,
> Hernan
> >> <hernan.vargas@intel.com>
> >> Subject: Re: [PATCH v3 09/12] baseband/acc: add FFT support to VRB2
> >> variant
> >>
> >>
> >>
> >> On 10/3/23 20:20, Chautru, Nicolas wrote:
> >>> Hi Maxime,
> >>>
> >>>> -----Original Message-----
> >>>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> >>>> Sent: Tuesday, October 3, 2023 7:37 AM
> >>>> To: Chautru, Nicolas <nicolas.chautru@intel.com>; dev@dpdk.org
> >>>> Cc: hemant.agrawal@nxp.com; david.marchand@redhat.com; Vargas,
> >> Hernan
> >>>> <hernan.vargas@intel.com>
> >>>> Subject: Re: [PATCH v3 09/12] baseband/acc: add FFT support to VRB2
> >>>> variant
> >>>>
> >>>>
> >>>>
> >>>> On 9/29/23 18:35, Nicolas Chautru wrote:
> >>>>> Support for the FFT the processing specific to the
> >>>>> VRB2 variant.
> >>>>>
> >>>>> Signed-off-by: Nicolas Chautru <nicolas.chautru@intel.com>
> >>>>> ---
> >>>>>     drivers/baseband/acc/rte_vrb_pmd.c | 132
> >>>> ++++++++++++++++++++++++++++-
> >>>>>     1 file changed, 128 insertions(+), 4 deletions(-)
> >>>>>
> >>>>> diff --git a/drivers/baseband/acc/rte_vrb_pmd.c
> >>>>> b/drivers/baseband/acc/rte_vrb_pmd.c
> >>>>> index 93add82947..ce4b90d8e7 100644
> >>>>> --- a/drivers/baseband/acc/rte_vrb_pmd.c
> >>>>> +++ b/drivers/baseband/acc/rte_vrb_pmd.c
> >>>>> @@ -903,6 +903,9 @@ vrb_queue_setup(struct rte_bbdev *dev,
> >>>>> uint16_t
> >>>> queue_id,
> >>>>>     			ACC_FCW_LD_BLEN : (conf->op_type ==
> >>>> RTE_BBDEV_OP_FFT ?
> >>>>>     			ACC_FCW_FFT_BLEN :
> ACC_FCW_MLDTS_BLEN))));
> >>>>>
> >>>>> +	if ((q->d->device_variant == VRB2_VARIANT) && (conf->op_type ==
> >>>> RTE_BBDEV_OP_FFT))
> >>>>> +		fcw_len = ACC_FCW_FFT_BLEN_3;
> >>>>> +
> >>>>>     	for (desc_idx = 0; desc_idx < d->sw_ring_max_depth;
> desc_idx++) {
> >>>>>     		desc = q->ring_addr + desc_idx;
> >>>>>     		desc->req.word0 = ACC_DMA_DESC_TYPE; @@ -
> 1323,6
> >>>> +1326,24 @@
> >>>>> vrb_dev_info_get(struct rte_bbdev *dev, struct
> >>>>> rte_bbdev_driver_info
> >>>> *dev_info)
> >>>>>     			.num_buffers_soft_out = 0,
> >>>>>     			}
> >>>>>     		},
> >>>>> +		{
> >>>>> +			.type	= RTE_BBDEV_OP_FFT,
> >>>>> +			.cap.fft = {
> >>>>> +				.capability_flags =
> >>>>> +
> >>>> 	RTE_BBDEV_FFT_WINDOWING |
> >>>>> +
> >>>> 	RTE_BBDEV_FFT_CS_ADJUSTMENT |
> >>>>> +
> >>>> 	RTE_BBDEV_FFT_DFT_BYPASS |
> >>>>> +
> >>>> 	RTE_BBDEV_FFT_IDFT_BYPASS |
> >>>>> +						RTE_BBDEV_FFT_FP16_INPUT
> >>>> |
> >>>>> +
> >>>> 	RTE_BBDEV_FFT_FP16_OUTPUT |
> >>>>> +
> >>>> 	RTE_BBDEV_FFT_POWER_MEAS |
> >>>>> +
> >>>> 	RTE_BBDEV_FFT_WINDOWING_BYPASS,
> >>>>> +				.num_buffers_src =
> >>>>> +						1,
> >>>>> +				.num_buffers_dst =
> >>>>> +						1,
> >>>>> +			}
> >>>>> +		},
> >>>>>     		RTE_BBDEV_END_OF_CAPABILITIES_LIST()
> >>>>>     	};
> >>>>>
> >>>>> @@ -3849,6 +3870,47 @@ vrb1_fcw_fft_fill(struct rte_bbdev_fft_op
> >>>>> *op,
> >>>> struct acc_fcw_fft *fcw)
> >>>>>     		fcw->bypass = 0;
> >>>>>     }
> >>>>>
> >>>>> +/* Fill in a frame control word for FFT processing. */ static
> >>>>> +inline void vrb2_fcw_fft_fill(struct rte_bbdev_fft_op *op, struct
> >>>>> +acc_fcw_fft_3 *fcw) {
> >>>>> +	fcw->in_frame_size = op->fft.input_sequence_size;
> >>>>> +	fcw->leading_pad_size = op->fft.input_leading_padding;
> >>>>> +	fcw->out_frame_size = op->fft.output_sequence_size;
> >>>>> +	fcw->leading_depad_size = op->fft.output_leading_depadding;
> >>>>> +	fcw->cs_window_sel = op->fft.window_index[0] +
> >>>>> +			(op->fft.window_index[1] << 8) +
> >>>>> +			(op->fft.window_index[2] << 16) +
> >>>>> +			(op->fft.window_index[3] << 24);
> >>>>> +	fcw->cs_window_sel2 = op->fft.window_index[4] +
> >>>>> +			(op->fft.window_index[5] << 8);
> >>>>> +	fcw->cs_enable_bmap = op->fft.cs_bitmap;
> >>>>> +	fcw->num_antennas = op->fft.num_antennas_log2;
> >>>>> +	fcw->idft_size = op->fft.idft_log2;
> >>>>> +	fcw->dft_size = op->fft.dft_log2;
> >>>>> +	fcw->cs_offset = op->fft.cs_time_adjustment;
> >>>>> +	fcw->idft_shift = op->fft.idft_shift;
> >>>>> +	fcw->dft_shift = op->fft.dft_shift;
> >>>>> +	fcw->cs_multiplier = op->fft.ncs_reciprocal;
> >>>>> +	fcw->power_shift = op->fft.power_shift; > +	fcw->exp_adj = op-
> >>>>> fft.fp16_exp_adjust;
> >>>>> +	fcw->fp16_in = check_bit(op->fft.op_flags,
> >>>> RTE_BBDEV_FFT_FP16_INPUT);
> >>>>> +	fcw->fp16_out = check_bit(op->fft.op_flags,
> >>>> RTE_BBDEV_FFT_FP16_OUTPUT);
> >>>>> +	fcw->power_en = check_bit(op->fft.op_flags,
> >>>> RTE_BBDEV_FFT_POWER_MEAS);
> >>>>> +	if (check_bit(op->fft.op_flags,
> >>>>> +			RTE_BBDEV_FFT_IDFT_BYPASS)) {
> >>>>> +		if (check_bit(op->fft.op_flags,
> >>>>> +				RTE_BBDEV_FFT_WINDOWING_BYPASS))
> >>>>> +			fcw->bypass = 2;
> >>>>> +		else
> >>>>> +			fcw->bypass = 1;
> >>>>> +	} else if (check_bit(op->fft.op_flags,
> >>>>> +			RTE_BBDEV_FFT_DFT_BYPASS))
> >>>>> +		fcw->bypass = 3;
> >>>>> +	else
> >>>>> +		fcw->bypass = 0;
> >>>>
> >>>> The only difference I see with VRB1 are backed by corresponding
> >>>> op_flags (POWER & FP16), is that correct? If so, it does not make
> >>>> sense to me to have a specific function for VRB2.
> >>>
> >>> There are more changes but these are only formally enabled in the
> >>> next stepping hence some of the related code is not included yet.
> >>> More generally
> >> the FCW and IP is different from VRB1 implementation.
> >>
> >> Currently, the code is almost identical so vrb1 implementation should
> >> be reused. If some later changes makes the two implementations
> >> diverge, then we can consider having a dedicated function for VRB2 at that
> time.
> >
> > If I may, I believe this is best as-is notably for patches and support.
> > The functions are fairly small (not much code overlap quantitatively)
> > and the underlying IP is different (with more differences we can
> > enable over time). I don’t think it would help anyone really to try to make
> them coexist for a small period of time.
> > Does that sound fair?
> 
> I disagree, as I explained the code currently is almost identical, so just share the
> code.
> 
> You will diverge, if *really* necessary, when it will make more sense to have
> two separate functions. For now it is not the case in my opinion.

OK I had another look. I can share a common descriptor generation function. For the FCW generation these are just different structures and sizes, different prototype, I really don't think it would make sense to try to artificially generate them together.
Updating in new v5 this week. 


> 
> Thanks,
> Maxime
> 
> >
> >
> >>
> >>>>
> >>>>> +}
> >>>>> +
> >>>>>     static inline int
> >>>>>     vrb1_dma_desc_fft_fill(struct rte_bbdev_fft_op *op,
> >>>>>     		struct acc_dma_req_desc *desc, @@ -3882,6
> +3944,58 @@
> >>>>> vrb1_dma_desc_fft_fill(struct rte_bbdev_fft_op
> >>>> *op,
> >>>>>     	return 0;
> >>>>>     }
> >>>>>
> >>>>> +static inline int
> >>>>> +vrb2_dma_desc_fft_fill(struct rte_bbdev_fft_op *op,
> >>>>> +		struct acc_dma_req_desc *desc,
> >>>>> +		struct rte_mbuf *input, struct rte_mbuf *output, struct
> >>>> rte_mbuf *win_input,
> >>>>> +		struct rte_mbuf *pwr, uint32_t *in_offset, uint32_t
> >>>> *out_offset,
> >>>>> +		uint32_t *win_offset, uint32_t *pwr_offset) {
> >>>>> +	bool pwr_en = check_bit(op->fft.op_flags,
> >>>> RTE_BBDEV_FFT_POWER_MEAS);
> >>>>> +	bool win_en = check_bit(op->fft.op_flags,
> >>>> RTE_BBDEV_FFT_DEWINDOWING);
> >>>>> +	int num_cs = 0, i, bd_idx = 1;
> >>>>> +
> >>>>> +	/* FCW already done */
> >>>>> +	acc_header_init(desc);
> >>>>> +
> >>>>> +	RTE_SET_USED(win_input);
> >>>>> +	RTE_SET_USED(win_offset);
> >>>>> +
> >>>>> +	desc->data_ptrs[bd_idx].address = rte_pktmbuf_iova_offset(input,
> >>>> *in_offset);
> >>>>> +	desc->data_ptrs[bd_idx].blen = op->fft.input_sequence_size *
> >>>> ACC_IQ_SIZE;
> >>>>> +	desc->data_ptrs[bd_idx].blkid = ACC_DMA_BLKID_IN;
> >>>>> +	desc->data_ptrs[bd_idx].last = 1;
> >>>>> +	desc->data_ptrs[bd_idx].dma_ext = 0;
> >>>>> +	bd_idx++;
> >>>>> +
> >>>>> +	desc->data_ptrs[bd_idx].address =
> >>>>> +rte_pktmbuf_iova_offset(output,
> >>>> *out_offset);
> >>>>> +	desc->data_ptrs[bd_idx].blen = op->fft.output_sequence_size *
> >>>> ACC_IQ_SIZE;
> >>>>> +	desc->data_ptrs[bd_idx].blkid = ACC_DMA_BLKID_OUT_HARD;
> >>>>> +	desc->data_ptrs[bd_idx].last = pwr_en ? 0 : 1;
> >>>>> +	desc->data_ptrs[bd_idx].dma_ext = 0;
> >>>>> +	desc->m2dlen = win_en ? 3 : 2;
> >>>>> +	desc->d2mlen = pwr_en ? 2 : 1;
> >>>>> +	desc->ib_ant_offset = op->fft.input_sequence_size;
> >>>>> +	desc->num_ant = op->fft.num_antennas_log2 - 3;
> >>>>> +
> >>>>> +	for (i = 0; i < RTE_BBDEV_MAX_CS; i++)
> >>>>> +		if (check_bit(op->fft.cs_bitmap, 1 << i))
> >>>>> +			num_cs++;
> >>>>> +	desc->num_cs = num_cs;
> >>>>> +
> >>>>> +	if (pwr_en && pwr) {
> >>>>> +		bd_idx++;
> >>>>> +		desc->data_ptrs[bd_idx].address =
> >>>> rte_pktmbuf_iova_offset(pwr, *pwr_offset);
> >>>>> +		desc->data_ptrs[bd_idx].blen = num_cs * (1 << op-
> >>>>> fft.num_antennas_log2) * 4;
> >>>>> +		desc->data_ptrs[bd_idx].blkid = ACC_DMA_BLKID_OUT_SOFT;
> >>>>> +		desc->data_ptrs[bd_idx].last = 1;
> >>>>> +		desc->data_ptrs[bd_idx].dma_ext = 0;
> >>>>> +	}
> >>>>> +	desc->ob_cyc_offset = op->fft.output_sequence_size;
> >>>>> +	desc->ob_ant_offset = op->fft.output_sequence_size * num_cs;
> >>>>> +	desc->op_addr = op;
> >>>>> +	return 0;
> >>>>> +}
> >>>>>
> >>>>>     /** Enqueue one FFT operation for device. */
> >>>>>     static inline int
> >>>>> @@ -3889,22 +4003,32 @@ vrb_enqueue_fft_one_op(struct acc_queue
> >> *q,
> >>>> struct rte_bbdev_fft_op *op,
> >>>>>     		uint16_t total_enqueued_cbs)
> >>>>>     {
> >>>>>     	union acc_dma_desc *desc;
> >>>>> -	struct rte_mbuf *input, *output;
> >>>>> -	uint32_t in_offset, out_offset;
> >>>>> +	struct rte_mbuf *input, *output, *pwr, *win;
> >>>>> +	uint32_t in_offset, out_offset, pwr_offset, win_offset;
> >>>>>     	struct acc_fcw_fft *fcw;
> >>>>>
> >>>>>     	desc = acc_desc(q, total_enqueued_cbs);
> >>>>>     	input = op->fft.base_input.data;
> >>>>>     	output = op->fft.base_output.data;
> >>>>> +	pwr = op->fft.power_meas_output.data;
> >>>>> +	win = op->fft.dewindowing_input.data;
> >>>>>     	in_offset = op->fft.base_input.offset;
> >>>>>     	out_offset = op->fft.base_output.offset;
> >>>>> +	pwr_offset = op->fft.power_meas_output.offset;
> >>>>> +	win_offset = op->fft.dewindowing_input.offset;
> >>>>>
> >>>>>     	fcw = (struct acc_fcw_fft *) (q->fcw_ring +
> >>>>>     			((q->sw_ring_head + total_enqueued_cbs) & q-
> >>>>> sw_ring_wrap_mask)
> >>>>>     			* ACC_MAX_FCW_SIZE);
> >>>>>
> >>>>> -	vrb1_fcw_fft_fill(op, fcw);
> >>>>> -	vrb1_dma_desc_fft_fill(op, &desc->req, input, output, &in_offset,
> >>>> &out_offset);
> >>>>> +	if (q->d->device_variant == VRB1_VARIANT) {
> >>>>> +		vrb1_fcw_fft_fill(op, fcw);
> >>>>> +		vrb1_dma_desc_fft_fill(op, &desc->req, input, output,
> >>>> &in_offset, &out_offset);
> >>>>> +	} else {
> >>>>> +		vrb2_fcw_fft_fill(op, (struct acc_fcw_fft_3 *) fcw);
> >>>>> +		vrb2_dma_desc_fft_fill(op, &desc->req, input, output, win,
> >>>> pwr,
> >>>>> +				&in_offset, &out_offset, &win_offset,
> >>>> &pwr_offset);
> >>>>> +	}
> >>>>>     #ifdef RTE_LIBRTE_BBDEV_DEBUG
> >>>>>     	rte_memdump(stderr, "FCW", &desc->req.fcw_fft,
> >>>>>     			sizeof(desc->req.fcw_fft));
> >>>
> >
  
Maxime Coquelin Oct. 6, 2023, 12:05 p.m. UTC | #7
On 10/5/23 19:59, Chautru, Nicolas wrote:
> Hi Maxime,
> 
>> -----Original Message-----
>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>> Sent: Thursday, October 5, 2023 7:35 AM
>> To: Chautru, Nicolas <nicolas.chautru@intel.com>; dev@dpdk.org
>> Cc: hemant.agrawal@nxp.com; david.marchand@redhat.com; Vargas, Hernan
>> <hernan.vargas@intel.com>
>> Subject: Re: [PATCH v3 09/12] baseband/acc: add FFT support to VRB2 variant
>>
>>
>>
>> On 10/4/23 23:18, Chautru, Nicolas wrote:
>>> Hi Maxime,
>>>
>>>> -----Original Message-----
>>>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>>>> Sent: Wednesday, October 4, 2023 12:11 AM
>>>> To: Chautru, Nicolas <nicolas.chautru@intel.com>; dev@dpdk.org
>>>> Cc: hemant.agrawal@nxp.com; david.marchand@redhat.com; Vargas,
>> Hernan
>>>> <hernan.vargas@intel.com>
>>>> Subject: Re: [PATCH v3 09/12] baseband/acc: add FFT support to VRB2
>>>> variant
>>>>
>>>>
>>>>
>>>> On 10/3/23 20:20, Chautru, Nicolas wrote:
>>>>> Hi Maxime,
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>>>>>> Sent: Tuesday, October 3, 2023 7:37 AM
>>>>>> To: Chautru, Nicolas <nicolas.chautru@intel.com>; dev@dpdk.org
>>>>>> Cc: hemant.agrawal@nxp.com; david.marchand@redhat.com; Vargas,
>>>> Hernan
>>>>>> <hernan.vargas@intel.com>
>>>>>> Subject: Re: [PATCH v3 09/12] baseband/acc: add FFT support to VRB2
>>>>>> variant
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 9/29/23 18:35, Nicolas Chautru wrote:
>>>>>>> Support for the FFT the processing specific to the
>>>>>>> VRB2 variant.
>>>>>>>
>>>>>>> Signed-off-by: Nicolas Chautru <nicolas.chautru@intel.com>
>>>>>>> ---
>>>>>>>      drivers/baseband/acc/rte_vrb_pmd.c | 132
>>>>>> ++++++++++++++++++++++++++++-
>>>>>>>      1 file changed, 128 insertions(+), 4 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/baseband/acc/rte_vrb_pmd.c
>>>>>>> b/drivers/baseband/acc/rte_vrb_pmd.c
>>>>>>> index 93add82947..ce4b90d8e7 100644
>>>>>>> --- a/drivers/baseband/acc/rte_vrb_pmd.c
>>>>>>> +++ b/drivers/baseband/acc/rte_vrb_pmd.c
>>>>>>> @@ -903,6 +903,9 @@ vrb_queue_setup(struct rte_bbdev *dev,
>>>>>>> uint16_t
>>>>>> queue_id,
>>>>>>>      			ACC_FCW_LD_BLEN : (conf->op_type ==
>>>>>> RTE_BBDEV_OP_FFT ?
>>>>>>>      			ACC_FCW_FFT_BLEN :
>> ACC_FCW_MLDTS_BLEN))));
>>>>>>>
>>>>>>> +	if ((q->d->device_variant == VRB2_VARIANT) && (conf->op_type ==
>>>>>> RTE_BBDEV_OP_FFT))
>>>>>>> +		fcw_len = ACC_FCW_FFT_BLEN_3;
>>>>>>> +
>>>>>>>      	for (desc_idx = 0; desc_idx < d->sw_ring_max_depth;
>> desc_idx++) {
>>>>>>>      		desc = q->ring_addr + desc_idx;
>>>>>>>      		desc->req.word0 = ACC_DMA_DESC_TYPE; @@ -
>> 1323,6
>>>>>> +1326,24 @@
>>>>>>> vrb_dev_info_get(struct rte_bbdev *dev, struct
>>>>>>> rte_bbdev_driver_info
>>>>>> *dev_info)
>>>>>>>      			.num_buffers_soft_out = 0,
>>>>>>>      			}
>>>>>>>      		},
>>>>>>> +		{
>>>>>>> +			.type	= RTE_BBDEV_OP_FFT,
>>>>>>> +			.cap.fft = {
>>>>>>> +				.capability_flags =
>>>>>>> +
>>>>>> 	RTE_BBDEV_FFT_WINDOWING |
>>>>>>> +
>>>>>> 	RTE_BBDEV_FFT_CS_ADJUSTMENT |
>>>>>>> +
>>>>>> 	RTE_BBDEV_FFT_DFT_BYPASS |
>>>>>>> +
>>>>>> 	RTE_BBDEV_FFT_IDFT_BYPASS |
>>>>>>> +						RTE_BBDEV_FFT_FP16_INPUT
>>>>>> |
>>>>>>> +
>>>>>> 	RTE_BBDEV_FFT_FP16_OUTPUT |
>>>>>>> +
>>>>>> 	RTE_BBDEV_FFT_POWER_MEAS |
>>>>>>> +
>>>>>> 	RTE_BBDEV_FFT_WINDOWING_BYPASS,
>>>>>>> +				.num_buffers_src =
>>>>>>> +						1,
>>>>>>> +				.num_buffers_dst =
>>>>>>> +						1,
>>>>>>> +			}
>>>>>>> +		},
>>>>>>>      		RTE_BBDEV_END_OF_CAPABILITIES_LIST()
>>>>>>>      	};
>>>>>>>
>>>>>>> @@ -3849,6 +3870,47 @@ vrb1_fcw_fft_fill(struct rte_bbdev_fft_op
>>>>>>> *op,
>>>>>> struct acc_fcw_fft *fcw)
>>>>>>>      		fcw->bypass = 0;
>>>>>>>      }
>>>>>>>
>>>>>>> +/* Fill in a frame control word for FFT processing. */ static
>>>>>>> +inline void vrb2_fcw_fft_fill(struct rte_bbdev_fft_op *op, struct
>>>>>>> +acc_fcw_fft_3 *fcw) {
>>>>>>> +	fcw->in_frame_size = op->fft.input_sequence_size;
>>>>>>> +	fcw->leading_pad_size = op->fft.input_leading_padding;
>>>>>>> +	fcw->out_frame_size = op->fft.output_sequence_size;
>>>>>>> +	fcw->leading_depad_size = op->fft.output_leading_depadding;
>>>>>>> +	fcw->cs_window_sel = op->fft.window_index[0] +
>>>>>>> +			(op->fft.window_index[1] << 8) +
>>>>>>> +			(op->fft.window_index[2] << 16) +
>>>>>>> +			(op->fft.window_index[3] << 24);
>>>>>>> +	fcw->cs_window_sel2 = op->fft.window_index[4] +
>>>>>>> +			(op->fft.window_index[5] << 8);
>>>>>>> +	fcw->cs_enable_bmap = op->fft.cs_bitmap;
>>>>>>> +	fcw->num_antennas = op->fft.num_antennas_log2;
>>>>>>> +	fcw->idft_size = op->fft.idft_log2;
>>>>>>> +	fcw->dft_size = op->fft.dft_log2;
>>>>>>> +	fcw->cs_offset = op->fft.cs_time_adjustment;
>>>>>>> +	fcw->idft_shift = op->fft.idft_shift;
>>>>>>> +	fcw->dft_shift = op->fft.dft_shift;
>>>>>>> +	fcw->cs_multiplier = op->fft.ncs_reciprocal;
>>>>>>> +	fcw->power_shift = op->fft.power_shift; > +	fcw->exp_adj = op-
>>>>>>> fft.fp16_exp_adjust;
>>>>>>> +	fcw->fp16_in = check_bit(op->fft.op_flags,
>>>>>> RTE_BBDEV_FFT_FP16_INPUT);
>>>>>>> +	fcw->fp16_out = check_bit(op->fft.op_flags,
>>>>>> RTE_BBDEV_FFT_FP16_OUTPUT);
>>>>>>> +	fcw->power_en = check_bit(op->fft.op_flags,
>>>>>> RTE_BBDEV_FFT_POWER_MEAS);
>>>>>>> +	if (check_bit(op->fft.op_flags,
>>>>>>> +			RTE_BBDEV_FFT_IDFT_BYPASS)) {
>>>>>>> +		if (check_bit(op->fft.op_flags,
>>>>>>> +				RTE_BBDEV_FFT_WINDOWING_BYPASS))
>>>>>>> +			fcw->bypass = 2;
>>>>>>> +		else
>>>>>>> +			fcw->bypass = 1;
>>>>>>> +	} else if (check_bit(op->fft.op_flags,
>>>>>>> +			RTE_BBDEV_FFT_DFT_BYPASS))
>>>>>>> +		fcw->bypass = 3;
>>>>>>> +	else
>>>>>>> +		fcw->bypass = 0;
>>>>>>
>>>>>> The only difference I see with VRB1 are backed by corresponding
>>>>>> op_flags (POWER & FP16), is that correct? If so, it does not make
>>>>>> sense to me to have a specific function for VRB2.
>>>>>
>>>>> There are more changes but these are only formally enabled in the
>>>>> next stepping hence some of the related code is not included yet.
>>>>> More generally
>>>> the FCW and IP is different from VRB1 implementation.
>>>>
>>>> Currently, the code is almost identical so vrb1 implementation should
>>>> be reused. If some later changes makes the two implementations
>>>> diverge, then we can consider having a dedicated function for VRB2 at that
>> time.
>>>
>>> If I may, I believe this is best as-is notably for patches and support.
>>> The functions are fairly small (not much code overlap quantitatively)
>>> and the underlying IP is different (with more differences we can
>>> enable over time). I don’t think it would help anyone really to try to make
>> them coexist for a small period of time.
>>> Does that sound fair?
>>
>> I disagree, as I explained the code currently is almost identical, so just share the
>> code.
>>
>> You will diverge, if *really* necessary, when it will make more sense to have
>> two separate functions. For now it is not the case in my opinion.
> 
> OK I had another look. I can share a common descriptor generation function. For the FCW generation these are just different structures and sizes, different prototype, I really don't think it would make sense to try to artificially generate them together.
> Updating in new v5 this week.

struct __rte_packed acc_fcw_fft {
	uint32_t in_frame_size:16,
		leading_pad_size:16;
	uint32_t out_frame_size:16,
		leading_depad_size:16;
	uint32_t cs_window_sel;
	uint32_t cs_window_sel2:16,
		cs_enable_bmap:16;
	uint32_t num_antennas:8,
		idft_size:8,
		dft_size:8,
		cs_offset:8;
	uint32_t idft_shift:8,
		dft_shift:8,
		cs_multiplier:16;
	uint32_t bypass:2,
		fp16_in:1, /* Not supported in VRB1 */
		fp16_out:1,
		exp_adj:4,
		power_shift:4,
		power_en:1,
		res:19;
};

+struct __rte_packed acc_fcw_fft_3 {
	uint32_t in_frame_size:16,
		leading_pad_size:16;
	uint32_t out_frame_size:16,
		leading_depad_size:16;
	uint32_t cs_window_sel;
	uint32_t cs_window_sel2:16,
		cs_enable_bmap:16;
	uint32_t num_antennas:8,
		idft_size:8,
		dft_size:8,
		cs_offset:8;
	uint32_t idft_shift:8,
		dft_shift:8,
		cs_multiplier:16;
	uint32_t bypass:2,
		fp16_in:1,
		fp16_out:1,
		exp_adj:4,
		power_shift:4,
		power_en:1,

===> New fields:
		enable_dewin:1,
		freq_resample_mode:2,
		depad_output_size:16;
	uint16_t cs_theta_0[ACC_MAX_CS];
	uint32_t cs_theta_d[ACC_MAX_CS];
	int8_t cs_time_offset[ACC_MAX_CS];
};

HW designers did it right with SW in mind. There a just new fields on
VRB2, so IMHO it can be shared also.

> 
> 
>>
>> Thanks,
>> Maxime
>>
>>>
>>>
>>>>
>>>>>>
>>>>>>> +}
>>>>>>> +
>>>>>>>      static inline int
>>>>>>>      vrb1_dma_desc_fft_fill(struct rte_bbdev_fft_op *op,
>>>>>>>      		struct acc_dma_req_desc *desc, @@ -3882,6
>> +3944,58 @@
>>>>>>> vrb1_dma_desc_fft_fill(struct rte_bbdev_fft_op
>>>>>> *op,
>>>>>>>      	return 0;
>>>>>>>      }
>>>>>>>
>>>>>>> +static inline int
>>>>>>> +vrb2_dma_desc_fft_fill(struct rte_bbdev_fft_op *op,
>>>>>>> +		struct acc_dma_req_desc *desc,
>>>>>>> +		struct rte_mbuf *input, struct rte_mbuf *output, struct
>>>>>> rte_mbuf *win_input,
>>>>>>> +		struct rte_mbuf *pwr, uint32_t *in_offset, uint32_t
>>>>>> *out_offset,
>>>>>>> +		uint32_t *win_offset, uint32_t *pwr_offset) {
>>>>>>> +	bool pwr_en = check_bit(op->fft.op_flags,
>>>>>> RTE_BBDEV_FFT_POWER_MEAS);
>>>>>>> +	bool win_en = check_bit(op->fft.op_flags,
>>>>>> RTE_BBDEV_FFT_DEWINDOWING);
>>>>>>> +	int num_cs = 0, i, bd_idx = 1;
>>>>>>> +
>>>>>>> +	/* FCW already done */
>>>>>>> +	acc_header_init(desc);
>>>>>>> +
>>>>>>> +	RTE_SET_USED(win_input);
>>>>>>> +	RTE_SET_USED(win_offset);
>>>>>>> +
>>>>>>> +	desc->data_ptrs[bd_idx].address = rte_pktmbuf_iova_offset(input,
>>>>>> *in_offset);
>>>>>>> +	desc->data_ptrs[bd_idx].blen = op->fft.input_sequence_size *
>>>>>> ACC_IQ_SIZE;
>>>>>>> +	desc->data_ptrs[bd_idx].blkid = ACC_DMA_BLKID_IN;
>>>>>>> +	desc->data_ptrs[bd_idx].last = 1;
>>>>>>> +	desc->data_ptrs[bd_idx].dma_ext = 0;
>>>>>>> +	bd_idx++;
>>>>>>> +
>>>>>>> +	desc->data_ptrs[bd_idx].address =
>>>>>>> +rte_pktmbuf_iova_offset(output,
>>>>>> *out_offset);
>>>>>>> +	desc->data_ptrs[bd_idx].blen = op->fft.output_sequence_size *
>>>>>> ACC_IQ_SIZE;
>>>>>>> +	desc->data_ptrs[bd_idx].blkid = ACC_DMA_BLKID_OUT_HARD;
>>>>>>> +	desc->data_ptrs[bd_idx].last = pwr_en ? 0 : 1;
>>>>>>> +	desc->data_ptrs[bd_idx].dma_ext = 0;
>>>>>>> +	desc->m2dlen = win_en ? 3 : 2;
>>>>>>> +	desc->d2mlen = pwr_en ? 2 : 1;
>>>>>>> +	desc->ib_ant_offset = op->fft.input_sequence_size;
>>>>>>> +	desc->num_ant = op->fft.num_antennas_log2 - 3;
>>>>>>> +
>>>>>>> +	for (i = 0; i < RTE_BBDEV_MAX_CS; i++)
>>>>>>> +		if (check_bit(op->fft.cs_bitmap, 1 << i))
>>>>>>> +			num_cs++;
>>>>>>> +	desc->num_cs = num_cs;
>>>>>>> +
>>>>>>> +	if (pwr_en && pwr) {
>>>>>>> +		bd_idx++;
>>>>>>> +		desc->data_ptrs[bd_idx].address =
>>>>>> rte_pktmbuf_iova_offset(pwr, *pwr_offset);
>>>>>>> +		desc->data_ptrs[bd_idx].blen = num_cs * (1 << op-
>>>>>>> fft.num_antennas_log2) * 4;
>>>>>>> +		desc->data_ptrs[bd_idx].blkid = ACC_DMA_BLKID_OUT_SOFT;
>>>>>>> +		desc->data_ptrs[bd_idx].last = 1;
>>>>>>> +		desc->data_ptrs[bd_idx].dma_ext = 0;
>>>>>>> +	}
>>>>>>> +	desc->ob_cyc_offset = op->fft.output_sequence_size;
>>>>>>> +	desc->ob_ant_offset = op->fft.output_sequence_size * num_cs;
>>>>>>> +	desc->op_addr = op;
>>>>>>> +	return 0;
>>>>>>> +}
>>>>>>>
>>>>>>>      /** Enqueue one FFT operation for device. */
>>>>>>>      static inline int
>>>>>>> @@ -3889,22 +4003,32 @@ vrb_enqueue_fft_one_op(struct acc_queue
>>>> *q,
>>>>>> struct rte_bbdev_fft_op *op,
>>>>>>>      		uint16_t total_enqueued_cbs)
>>>>>>>      {
>>>>>>>      	union acc_dma_desc *desc;
>>>>>>> -	struct rte_mbuf *input, *output;
>>>>>>> -	uint32_t in_offset, out_offset;
>>>>>>> +	struct rte_mbuf *input, *output, *pwr, *win;
>>>>>>> +	uint32_t in_offset, out_offset, pwr_offset, win_offset;
>>>>>>>      	struct acc_fcw_fft *fcw;
>>>>>>>
>>>>>>>      	desc = acc_desc(q, total_enqueued_cbs);
>>>>>>>      	input = op->fft.base_input.data;
>>>>>>>      	output = op->fft.base_output.data;
>>>>>>> +	pwr = op->fft.power_meas_output.data;
>>>>>>> +	win = op->fft.dewindowing_input.data;
>>>>>>>      	in_offset = op->fft.base_input.offset;
>>>>>>>      	out_offset = op->fft.base_output.offset;
>>>>>>> +	pwr_offset = op->fft.power_meas_output.offset;
>>>>>>> +	win_offset = op->fft.dewindowing_input.offset;
>>>>>>>
>>>>>>>      	fcw = (struct acc_fcw_fft *) (q->fcw_ring +
>>>>>>>      			((q->sw_ring_head + total_enqueued_cbs) & q-
>>>>>>> sw_ring_wrap_mask)
>>>>>>>      			* ACC_MAX_FCW_SIZE);
>>>>>>>
>>>>>>> -	vrb1_fcw_fft_fill(op, fcw);
>>>>>>> -	vrb1_dma_desc_fft_fill(op, &desc->req, input, output, &in_offset,
>>>>>> &out_offset);
>>>>>>> +	if (q->d->device_variant == VRB1_VARIANT) {
>>>>>>> +		vrb1_fcw_fft_fill(op, fcw);
>>>>>>> +		vrb1_dma_desc_fft_fill(op, &desc->req, input, output,
>>>>>> &in_offset, &out_offset);
>>>>>>> +	} else {
>>>>>>> +		vrb2_fcw_fft_fill(op, (struct acc_fcw_fft_3 *) fcw);
>>>>>>> +		vrb2_dma_desc_fft_fill(op, &desc->req, input, output, win,
>>>>>> pwr,
>>>>>>> +				&in_offset, &out_offset, &win_offset,
>>>>>> &pwr_offset);
>>>>>>> +	}
>>>>>>>      #ifdef RTE_LIBRTE_BBDEV_DEBUG
>>>>>>>      	rte_memdump(stderr, "FCW", &desc->req.fcw_fft,
>>>>>>>      			sizeof(desc->req.fcw_fft));
>>>>>
>>>
>
  
Chautru, Nicolas Oct. 6, 2023, 8:25 p.m. UTC | #8
Hi Maxime, 

> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Friday, October 6, 2023 5:06 AM
> To: Chautru, Nicolas <nicolas.chautru@intel.com>; dev@dpdk.org
> Cc: hemant.agrawal@nxp.com; david.marchand@redhat.com; Vargas, Hernan
> <hernan.vargas@intel.com>
> Subject: Re: [PATCH v3 09/12] baseband/acc: add FFT support to VRB2 variant
> 
> 
> 
> On 10/5/23 19:59, Chautru, Nicolas wrote:
> > Hi Maxime,
> >
> >> -----Original Message-----
> >> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> >> Sent: Thursday, October 5, 2023 7:35 AM
> >> To: Chautru, Nicolas <nicolas.chautru@intel.com>; dev@dpdk.org
> >> Cc: hemant.agrawal@nxp.com; david.marchand@redhat.com; Vargas,
> Hernan
> >> <hernan.vargas@intel.com>
> >> Subject: Re: [PATCH v3 09/12] baseband/acc: add FFT support to VRB2
> >> variant
> >>
> >>
> >>
> >> On 10/4/23 23:18, Chautru, Nicolas wrote:
> >>> Hi Maxime,
> >>>
> >>>> -----Original Message-----
> >>>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> >>>> Sent: Wednesday, October 4, 2023 12:11 AM
> >>>> To: Chautru, Nicolas <nicolas.chautru@intel.com>; dev@dpdk.org
> >>>> Cc: hemant.agrawal@nxp.com; david.marchand@redhat.com; Vargas,
> >> Hernan
> >>>> <hernan.vargas@intel.com>
> >>>> Subject: Re: [PATCH v3 09/12] baseband/acc: add FFT support to VRB2
> >>>> variant
> >>>>
> >>>>
> >>>>
> >>>> On 10/3/23 20:20, Chautru, Nicolas wrote:
> >>>>> Hi Maxime,
> >>>>>
> >>>>>> -----Original Message-----
> >>>>>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> >>>>>> Sent: Tuesday, October 3, 2023 7:37 AM
> >>>>>> To: Chautru, Nicolas <nicolas.chautru@intel.com>; dev@dpdk.org
> >>>>>> Cc: hemant.agrawal@nxp.com; david.marchand@redhat.com; Vargas,
> >>>> Hernan
> >>>>>> <hernan.vargas@intel.com>
> >>>>>> Subject: Re: [PATCH v3 09/12] baseband/acc: add FFT support to
> >>>>>> VRB2 variant
> >>>>>>
> >>>>>>
> >>>>>>
> >>>>>> On 9/29/23 18:35, Nicolas Chautru wrote:
> >>>>>>> Support for the FFT the processing specific to the
> >>>>>>> VRB2 variant.
> >>>>>>>
> >>>>>>> Signed-off-by: Nicolas Chautru <nicolas.chautru@intel.com>
> >>>>>>> ---
> >>>>>>>      drivers/baseband/acc/rte_vrb_pmd.c | 132
> >>>>>> ++++++++++++++++++++++++++++-
> >>>>>>>      1 file changed, 128 insertions(+), 4 deletions(-)
> >>>>>>>
> >>>>>>> diff --git a/drivers/baseband/acc/rte_vrb_pmd.c
> >>>>>>> b/drivers/baseband/acc/rte_vrb_pmd.c
> >>>>>>> index 93add82947..ce4b90d8e7 100644
> >>>>>>> --- a/drivers/baseband/acc/rte_vrb_pmd.c
> >>>>>>> +++ b/drivers/baseband/acc/rte_vrb_pmd.c
> >>>>>>> @@ -903,6 +903,9 @@ vrb_queue_setup(struct rte_bbdev *dev,
> >>>>>>> uint16_t
> >>>>>> queue_id,
> >>>>>>>      			ACC_FCW_LD_BLEN : (conf->op_type ==
> >>>>>> RTE_BBDEV_OP_FFT ?
> >>>>>>>      			ACC_FCW_FFT_BLEN :
> >> ACC_FCW_MLDTS_BLEN))));
> >>>>>>>
> >>>>>>> +	if ((q->d->device_variant == VRB2_VARIANT) && (conf->op_type
> >>>>>>> +==
> >>>>>> RTE_BBDEV_OP_FFT))
> >>>>>>> +		fcw_len = ACC_FCW_FFT_BLEN_3;
> >>>>>>> +
> >>>>>>>      	for (desc_idx = 0; desc_idx < d->sw_ring_max_depth;
> >> desc_idx++) {
> >>>>>>>      		desc = q->ring_addr + desc_idx;
> >>>>>>>      		desc->req.word0 = ACC_DMA_DESC_TYPE; @@ -
> >> 1323,6
> >>>>>> +1326,24 @@
> >>>>>>> vrb_dev_info_get(struct rte_bbdev *dev, struct
> >>>>>>> rte_bbdev_driver_info
> >>>>>> *dev_info)
> >>>>>>>      			.num_buffers_soft_out = 0,
> >>>>>>>      			}
> >>>>>>>      		},
> >>>>>>> +		{
> >>>>>>> +			.type	= RTE_BBDEV_OP_FFT,
> >>>>>>> +			.cap.fft = {
> >>>>>>> +				.capability_flags =
> >>>>>>> +
> >>>>>> 	RTE_BBDEV_FFT_WINDOWING |
> >>>>>>> +
> >>>>>> 	RTE_BBDEV_FFT_CS_ADJUSTMENT |
> >>>>>>> +
> >>>>>> 	RTE_BBDEV_FFT_DFT_BYPASS |
> >>>>>>> +
> >>>>>> 	RTE_BBDEV_FFT_IDFT_BYPASS |
> >>>>>>> +
> 	RTE_BBDEV_FFT_FP16_INPUT
> >>>>>> |
> >>>>>>> +
> >>>>>> 	RTE_BBDEV_FFT_FP16_OUTPUT |
> >>>>>>> +
> >>>>>> 	RTE_BBDEV_FFT_POWER_MEAS |
> >>>>>>> +
> >>>>>> 	RTE_BBDEV_FFT_WINDOWING_BYPASS,
> >>>>>>> +				.num_buffers_src =
> >>>>>>> +						1,
> >>>>>>> +				.num_buffers_dst =
> >>>>>>> +						1,
> >>>>>>> +			}
> >>>>>>> +		},
> >>>>>>>      		RTE_BBDEV_END_OF_CAPABILITIES_LIST()
> >>>>>>>      	};
> >>>>>>>
> >>>>>>> @@ -3849,6 +3870,47 @@ vrb1_fcw_fft_fill(struct rte_bbdev_fft_op
> >>>>>>> *op,
> >>>>>> struct acc_fcw_fft *fcw)
> >>>>>>>      		fcw->bypass = 0;
> >>>>>>>      }
> >>>>>>>
> >>>>>>> +/* Fill in a frame control word for FFT processing. */ static
> >>>>>>> +inline void vrb2_fcw_fft_fill(struct rte_bbdev_fft_op *op,
> >>>>>>> +struct
> >>>>>>> +acc_fcw_fft_3 *fcw) {
> >>>>>>> +	fcw->in_frame_size = op->fft.input_sequence_size;
> >>>>>>> +	fcw->leading_pad_size = op->fft.input_leading_padding;
> >>>>>>> +	fcw->out_frame_size = op->fft.output_sequence_size;
> >>>>>>> +	fcw->leading_depad_size = op->fft.output_leading_depadding;
> >>>>>>> +	fcw->cs_window_sel = op->fft.window_index[0] +
> >>>>>>> +			(op->fft.window_index[1] << 8) +
> >>>>>>> +			(op->fft.window_index[2] << 16) +
> >>>>>>> +			(op->fft.window_index[3] << 24);
> >>>>>>> +	fcw->cs_window_sel2 = op->fft.window_index[4] +
> >>>>>>> +			(op->fft.window_index[5] << 8);
> >>>>>>> +	fcw->cs_enable_bmap = op->fft.cs_bitmap;
> >>>>>>> +	fcw->num_antennas = op->fft.num_antennas_log2;
> >>>>>>> +	fcw->idft_size = op->fft.idft_log2;
> >>>>>>> +	fcw->dft_size = op->fft.dft_log2;
> >>>>>>> +	fcw->cs_offset = op->fft.cs_time_adjustment;
> >>>>>>> +	fcw->idft_shift = op->fft.idft_shift;
> >>>>>>> +	fcw->dft_shift = op->fft.dft_shift;
> >>>>>>> +	fcw->cs_multiplier = op->fft.ncs_reciprocal;
> >>>>>>> +	fcw->power_shift = op->fft.power_shift; > +	fcw->exp_adj
> = op-
> >>>>>>> fft.fp16_exp_adjust;
> >>>>>>> +	fcw->fp16_in = check_bit(op->fft.op_flags,
> >>>>>> RTE_BBDEV_FFT_FP16_INPUT);
> >>>>>>> +	fcw->fp16_out = check_bit(op->fft.op_flags,
> >>>>>> RTE_BBDEV_FFT_FP16_OUTPUT);
> >>>>>>> +	fcw->power_en = check_bit(op->fft.op_flags,
> >>>>>> RTE_BBDEV_FFT_POWER_MEAS);
> >>>>>>> +	if (check_bit(op->fft.op_flags,
> >>>>>>> +			RTE_BBDEV_FFT_IDFT_BYPASS)) {
> >>>>>>> +		if (check_bit(op->fft.op_flags,
> >>>>>>> +
> 	RTE_BBDEV_FFT_WINDOWING_BYPASS))
> >>>>>>> +			fcw->bypass = 2;
> >>>>>>> +		else
> >>>>>>> +			fcw->bypass = 1;
> >>>>>>> +	} else if (check_bit(op->fft.op_flags,
> >>>>>>> +			RTE_BBDEV_FFT_DFT_BYPASS))
> >>>>>>> +		fcw->bypass = 3;
> >>>>>>> +	else
> >>>>>>> +		fcw->bypass = 0;
> >>>>>>
> >>>>>> The only difference I see with VRB1 are backed by corresponding
> >>>>>> op_flags (POWER & FP16), is that correct? If so, it does not make
> >>>>>> sense to me to have a specific function for VRB2.
> >>>>>
> >>>>> There are more changes but these are only formally enabled in the
> >>>>> next stepping hence some of the related code is not included yet.
> >>>>> More generally
> >>>> the FCW and IP is different from VRB1 implementation.
> >>>>
> >>>> Currently, the code is almost identical so vrb1 implementation
> >>>> should be reused. If some later changes makes the two
> >>>> implementations diverge, then we can consider having a dedicated
> >>>> function for VRB2 at that
> >> time.
> >>>
> >>> If I may, I believe this is best as-is notably for patches and support.
> >>> The functions are fairly small (not much code overlap
> >>> quantitatively) and the underlying IP is different (with more
> >>> differences we can enable over time). I don’t think it would help
> >>> anyone really to try to make
> >> them coexist for a small period of time.
> >>> Does that sound fair?
> >>
> >> I disagree, as I explained the code currently is almost identical, so
> >> just share the code.
> >>
> >> You will diverge, if *really* necessary, when it will make more sense
> >> to have two separate functions. For now it is not the case in my opinion.
> >
> > OK I had another look. I can share a common descriptor generation function.
> For the FCW generation these are just different structures and sizes, different
> prototype, I really don't think it would make sense to try to artificially generate
> them together.
> > Updating in new v5 this week.
> 
> struct __rte_packed acc_fcw_fft {
> 	uint32_t in_frame_size:16,
> 		leading_pad_size:16;
> 	uint32_t out_frame_size:16,
> 		leading_depad_size:16;
> 	uint32_t cs_window_sel;
> 	uint32_t cs_window_sel2:16,
> 		cs_enable_bmap:16;
> 	uint32_t num_antennas:8,
> 		idft_size:8,
> 		dft_size:8,
> 		cs_offset:8;
> 	uint32_t idft_shift:8,
> 		dft_shift:8,
> 		cs_multiplier:16;
> 	uint32_t bypass:2,
> 		fp16_in:1, /* Not supported in VRB1 */
> 		fp16_out:1,
> 		exp_adj:4,
> 		power_shift:4,
> 		power_en:1,
> 		res:19;
> };
> 
> +struct __rte_packed acc_fcw_fft_3 {
> 	uint32_t in_frame_size:16,
> 		leading_pad_size:16;
> 	uint32_t out_frame_size:16,
> 		leading_depad_size:16;
> 	uint32_t cs_window_sel;
> 	uint32_t cs_window_sel2:16,
> 		cs_enable_bmap:16;
> 	uint32_t num_antennas:8,
> 		idft_size:8,
> 		dft_size:8,
> 		cs_offset:8;
> 	uint32_t idft_shift:8,
> 		dft_shift:8,
> 		cs_multiplier:16;
> 	uint32_t bypass:2,
> 		fp16_in:1,
> 		fp16_out:1,
> 		exp_adj:4,
> 		power_shift:4,
> 		power_en:1,
> 
> ===> New fields:
> 		enable_dewin:1,
> 		freq_resample_mode:2,
> 		depad_output_size:16;
> 	uint16_t cs_theta_0[ACC_MAX_CS];
> 	uint32_t cs_theta_d[ACC_MAX_CS];
> 	int8_t cs_time_offset[ACC_MAX_CS];
> };
> 
> HW designers did it right with SW in mind. There a just new fields on VRB2, so
> IMHO it can be shared also.

I really don’t believe we should, they have different size and different FCW. 
I don't want to obfuscate this in the code and make it look
artificially as if they have the same FCW by doing cast for structure of different size. 
Also there are extra stepping variations that we have to manage here. 

I believe the previous suggestion to change the descriptor was very valuable
but on this very one that would be too artificial to me for the reasons above. 

Thanks

> 
> >
> >
> >>
> >> Thanks,
> >> Maxime
> >>
> >>>
> >>>
> >>>>
> >>>>>>
> >>>>>>> +}
> >>>>>>> +
> >>>>>>>      static inline int
> >>>>>>>      vrb1_dma_desc_fft_fill(struct rte_bbdev_fft_op *op,
> >>>>>>>      		struct acc_dma_req_desc *desc, @@ -3882,6
> >> +3944,58 @@
> >>>>>>> vrb1_dma_desc_fft_fill(struct rte_bbdev_fft_op
> >>>>>> *op,
> >>>>>>>      	return 0;
> >>>>>>>      }
> >>>>>>>
> >>>>>>> +static inline int
> >>>>>>> +vrb2_dma_desc_fft_fill(struct rte_bbdev_fft_op *op,
> >>>>>>> +		struct acc_dma_req_desc *desc,
> >>>>>>> +		struct rte_mbuf *input, struct rte_mbuf *output, struct
> >>>>>> rte_mbuf *win_input,
> >>>>>>> +		struct rte_mbuf *pwr, uint32_t *in_offset, uint32_t
> >>>>>> *out_offset,
> >>>>>>> +		uint32_t *win_offset, uint32_t *pwr_offset) {
> >>>>>>> +	bool pwr_en = check_bit(op->fft.op_flags,
> >>>>>> RTE_BBDEV_FFT_POWER_MEAS);
> >>>>>>> +	bool win_en = check_bit(op->fft.op_flags,
> >>>>>> RTE_BBDEV_FFT_DEWINDOWING);
> >>>>>>> +	int num_cs = 0, i, bd_idx = 1;
> >>>>>>> +
> >>>>>>> +	/* FCW already done */
> >>>>>>> +	acc_header_init(desc);
> >>>>>>> +
> >>>>>>> +	RTE_SET_USED(win_input);
> >>>>>>> +	RTE_SET_USED(win_offset);
> >>>>>>> +
> >>>>>>> +	desc->data_ptrs[bd_idx].address =
> >>>>>>> +rte_pktmbuf_iova_offset(input,
> >>>>>> *in_offset);
> >>>>>>> +	desc->data_ptrs[bd_idx].blen = op->fft.input_sequence_size *
> >>>>>> ACC_IQ_SIZE;
> >>>>>>> +	desc->data_ptrs[bd_idx].blkid = ACC_DMA_BLKID_IN;
> >>>>>>> +	desc->data_ptrs[bd_idx].last = 1;
> >>>>>>> +	desc->data_ptrs[bd_idx].dma_ext = 0;
> >>>>>>> +	bd_idx++;
> >>>>>>> +
> >>>>>>> +	desc->data_ptrs[bd_idx].address =
> >>>>>>> +rte_pktmbuf_iova_offset(output,
> >>>>>> *out_offset);
> >>>>>>> +	desc->data_ptrs[bd_idx].blen = op->fft.output_sequence_size *
> >>>>>> ACC_IQ_SIZE;
> >>>>>>> +	desc->data_ptrs[bd_idx].blkid = ACC_DMA_BLKID_OUT_HARD;
> >>>>>>> +	desc->data_ptrs[bd_idx].last = pwr_en ? 0 : 1;
> >>>>>>> +	desc->data_ptrs[bd_idx].dma_ext = 0;
> >>>>>>> +	desc->m2dlen = win_en ? 3 : 2;
> >>>>>>> +	desc->d2mlen = pwr_en ? 2 : 1;
> >>>>>>> +	desc->ib_ant_offset = op->fft.input_sequence_size;
> >>>>>>> +	desc->num_ant = op->fft.num_antennas_log2 - 3;
> >>>>>>> +
> >>>>>>> +	for (i = 0; i < RTE_BBDEV_MAX_CS; i++)
> >>>>>>> +		if (check_bit(op->fft.cs_bitmap, 1 << i))
> >>>>>>> +			num_cs++;
> >>>>>>> +	desc->num_cs = num_cs;
> >>>>>>> +
> >>>>>>> +	if (pwr_en && pwr) {
> >>>>>>> +		bd_idx++;
> >>>>>>> +		desc->data_ptrs[bd_idx].address =
> >>>>>> rte_pktmbuf_iova_offset(pwr, *pwr_offset);
> >>>>>>> +		desc->data_ptrs[bd_idx].blen = num_cs * (1 << op-
> >>>>>>> fft.num_antennas_log2) * 4;
> >>>>>>> +		desc->data_ptrs[bd_idx].blkid =
> ACC_DMA_BLKID_OUT_SOFT;
> >>>>>>> +		desc->data_ptrs[bd_idx].last = 1;
> >>>>>>> +		desc->data_ptrs[bd_idx].dma_ext = 0;
> >>>>>>> +	}
> >>>>>>> +	desc->ob_cyc_offset = op->fft.output_sequence_size;
> >>>>>>> +	desc->ob_ant_offset = op->fft.output_sequence_size * num_cs;
> >>>>>>> +	desc->op_addr = op;
> >>>>>>> +	return 0;
> >>>>>>> +}
> >>>>>>>
> >>>>>>>      /** Enqueue one FFT operation for device. */
> >>>>>>>      static inline int
> >>>>>>> @@ -3889,22 +4003,32 @@ vrb_enqueue_fft_one_op(struct
> acc_queue
> >>>> *q,
> >>>>>> struct rte_bbdev_fft_op *op,
> >>>>>>>      		uint16_t total_enqueued_cbs)
> >>>>>>>      {
> >>>>>>>      	union acc_dma_desc *desc;
> >>>>>>> -	struct rte_mbuf *input, *output;
> >>>>>>> -	uint32_t in_offset, out_offset;
> >>>>>>> +	struct rte_mbuf *input, *output, *pwr, *win;
> >>>>>>> +	uint32_t in_offset, out_offset, pwr_offset, win_offset;
> >>>>>>>      	struct acc_fcw_fft *fcw;
> >>>>>>>
> >>>>>>>      	desc = acc_desc(q, total_enqueued_cbs);
> >>>>>>>      	input = op->fft.base_input.data;
> >>>>>>>      	output = op->fft.base_output.data;
> >>>>>>> +	pwr = op->fft.power_meas_output.data;
> >>>>>>> +	win = op->fft.dewindowing_input.data;
> >>>>>>>      	in_offset = op->fft.base_input.offset;
> >>>>>>>      	out_offset = op->fft.base_output.offset;
> >>>>>>> +	pwr_offset = op->fft.power_meas_output.offset;
> >>>>>>> +	win_offset = op->fft.dewindowing_input.offset;
> >>>>>>>
> >>>>>>>      	fcw = (struct acc_fcw_fft *) (q->fcw_ring +
> >>>>>>>      			((q->sw_ring_head + total_enqueued_cbs) & q-
> >>>>>>> sw_ring_wrap_mask)
> >>>>>>>      			* ACC_MAX_FCW_SIZE);
> >>>>>>>
> >>>>>>> -	vrb1_fcw_fft_fill(op, fcw);
> >>>>>>> -	vrb1_dma_desc_fft_fill(op, &desc->req, input, output,
> &in_offset,
> >>>>>> &out_offset);
> >>>>>>> +	if (q->d->device_variant == VRB1_VARIANT) {
> >>>>>>> +		vrb1_fcw_fft_fill(op, fcw);
> >>>>>>> +		vrb1_dma_desc_fft_fill(op, &desc->req, input, output,
> >>>>>> &in_offset, &out_offset);
> >>>>>>> +	} else {
> >>>>>>> +		vrb2_fcw_fft_fill(op, (struct acc_fcw_fft_3 *) fcw);
> >>>>>>> +		vrb2_dma_desc_fft_fill(op, &desc->req, input, output,
> win,
> >>>>>> pwr,
> >>>>>>> +				&in_offset, &out_offset, &win_offset,
> >>>>>> &pwr_offset);
> >>>>>>> +	}
> >>>>>>>      #ifdef RTE_LIBRTE_BBDEV_DEBUG
> >>>>>>>      	rte_memdump(stderr, "FCW", &desc->req.fcw_fft,
> >>>>>>>      			sizeof(desc->req.fcw_fft));
> >>>>>
> >>>
> >
  

Patch

diff --git a/drivers/baseband/acc/rte_vrb_pmd.c b/drivers/baseband/acc/rte_vrb_pmd.c
index 93add82947..ce4b90d8e7 100644
--- a/drivers/baseband/acc/rte_vrb_pmd.c
+++ b/drivers/baseband/acc/rte_vrb_pmd.c
@@ -903,6 +903,9 @@  vrb_queue_setup(struct rte_bbdev *dev, uint16_t queue_id,
 			ACC_FCW_LD_BLEN : (conf->op_type == RTE_BBDEV_OP_FFT ?
 			ACC_FCW_FFT_BLEN : ACC_FCW_MLDTS_BLEN))));
 
+	if ((q->d->device_variant == VRB2_VARIANT) && (conf->op_type == RTE_BBDEV_OP_FFT))
+		fcw_len = ACC_FCW_FFT_BLEN_3;
+
 	for (desc_idx = 0; desc_idx < d->sw_ring_max_depth; desc_idx++) {
 		desc = q->ring_addr + desc_idx;
 		desc->req.word0 = ACC_DMA_DESC_TYPE;
@@ -1323,6 +1326,24 @@  vrb_dev_info_get(struct rte_bbdev *dev, struct rte_bbdev_driver_info *dev_info)
 			.num_buffers_soft_out = 0,
 			}
 		},
+		{
+			.type	= RTE_BBDEV_OP_FFT,
+			.cap.fft = {
+				.capability_flags =
+						RTE_BBDEV_FFT_WINDOWING |
+						RTE_BBDEV_FFT_CS_ADJUSTMENT |
+						RTE_BBDEV_FFT_DFT_BYPASS |
+						RTE_BBDEV_FFT_IDFT_BYPASS |
+						RTE_BBDEV_FFT_FP16_INPUT |
+						RTE_BBDEV_FFT_FP16_OUTPUT |
+						RTE_BBDEV_FFT_POWER_MEAS |
+						RTE_BBDEV_FFT_WINDOWING_BYPASS,
+				.num_buffers_src =
+						1,
+				.num_buffers_dst =
+						1,
+			}
+		},
 		RTE_BBDEV_END_OF_CAPABILITIES_LIST()
 	};
 
@@ -3849,6 +3870,47 @@  vrb1_fcw_fft_fill(struct rte_bbdev_fft_op *op, struct acc_fcw_fft *fcw)
 		fcw->bypass = 0;
 }
 
+/* Fill in a frame control word for FFT processing. */
+static inline void
+vrb2_fcw_fft_fill(struct rte_bbdev_fft_op *op, struct acc_fcw_fft_3 *fcw)
+{
+	fcw->in_frame_size = op->fft.input_sequence_size;
+	fcw->leading_pad_size = op->fft.input_leading_padding;
+	fcw->out_frame_size = op->fft.output_sequence_size;
+	fcw->leading_depad_size = op->fft.output_leading_depadding;
+	fcw->cs_window_sel = op->fft.window_index[0] +
+			(op->fft.window_index[1] << 8) +
+			(op->fft.window_index[2] << 16) +
+			(op->fft.window_index[3] << 24);
+	fcw->cs_window_sel2 = op->fft.window_index[4] +
+			(op->fft.window_index[5] << 8);
+	fcw->cs_enable_bmap = op->fft.cs_bitmap;
+	fcw->num_antennas = op->fft.num_antennas_log2;
+	fcw->idft_size = op->fft.idft_log2;
+	fcw->dft_size = op->fft.dft_log2;
+	fcw->cs_offset = op->fft.cs_time_adjustment;
+	fcw->idft_shift = op->fft.idft_shift;
+	fcw->dft_shift = op->fft.dft_shift;
+	fcw->cs_multiplier = op->fft.ncs_reciprocal;
+	fcw->power_shift = op->fft.power_shift;
+	fcw->exp_adj = op->fft.fp16_exp_adjust;
+	fcw->fp16_in = check_bit(op->fft.op_flags, RTE_BBDEV_FFT_FP16_INPUT);
+	fcw->fp16_out = check_bit(op->fft.op_flags, RTE_BBDEV_FFT_FP16_OUTPUT);
+	fcw->power_en = check_bit(op->fft.op_flags, RTE_BBDEV_FFT_POWER_MEAS);
+	if (check_bit(op->fft.op_flags,
+			RTE_BBDEV_FFT_IDFT_BYPASS)) {
+		if (check_bit(op->fft.op_flags,
+				RTE_BBDEV_FFT_WINDOWING_BYPASS))
+			fcw->bypass = 2;
+		else
+			fcw->bypass = 1;
+	} else if (check_bit(op->fft.op_flags,
+			RTE_BBDEV_FFT_DFT_BYPASS))
+		fcw->bypass = 3;
+	else
+		fcw->bypass = 0;
+}
+
 static inline int
 vrb1_dma_desc_fft_fill(struct rte_bbdev_fft_op *op,
 		struct acc_dma_req_desc *desc,
@@ -3882,6 +3944,58 @@  vrb1_dma_desc_fft_fill(struct rte_bbdev_fft_op *op,
 	return 0;
 }
 
+static inline int
+vrb2_dma_desc_fft_fill(struct rte_bbdev_fft_op *op,
+		struct acc_dma_req_desc *desc,
+		struct rte_mbuf *input, struct rte_mbuf *output, struct rte_mbuf *win_input,
+		struct rte_mbuf *pwr, uint32_t *in_offset, uint32_t *out_offset,
+		uint32_t *win_offset, uint32_t *pwr_offset)
+{
+	bool pwr_en = check_bit(op->fft.op_flags, RTE_BBDEV_FFT_POWER_MEAS);
+	bool win_en = check_bit(op->fft.op_flags, RTE_BBDEV_FFT_DEWINDOWING);
+	int num_cs = 0, i, bd_idx = 1;
+
+	/* FCW already done */
+	acc_header_init(desc);
+
+	RTE_SET_USED(win_input);
+	RTE_SET_USED(win_offset);
+
+	desc->data_ptrs[bd_idx].address = rte_pktmbuf_iova_offset(input, *in_offset);
+	desc->data_ptrs[bd_idx].blen = op->fft.input_sequence_size * ACC_IQ_SIZE;
+	desc->data_ptrs[bd_idx].blkid = ACC_DMA_BLKID_IN;
+	desc->data_ptrs[bd_idx].last = 1;
+	desc->data_ptrs[bd_idx].dma_ext = 0;
+	bd_idx++;
+
+	desc->data_ptrs[bd_idx].address = rte_pktmbuf_iova_offset(output, *out_offset);
+	desc->data_ptrs[bd_idx].blen = op->fft.output_sequence_size * ACC_IQ_SIZE;
+	desc->data_ptrs[bd_idx].blkid = ACC_DMA_BLKID_OUT_HARD;
+	desc->data_ptrs[bd_idx].last = pwr_en ? 0 : 1;
+	desc->data_ptrs[bd_idx].dma_ext = 0;
+	desc->m2dlen = win_en ? 3 : 2;
+	desc->d2mlen = pwr_en ? 2 : 1;
+	desc->ib_ant_offset = op->fft.input_sequence_size;
+	desc->num_ant = op->fft.num_antennas_log2 - 3;
+
+	for (i = 0; i < RTE_BBDEV_MAX_CS; i++)
+		if (check_bit(op->fft.cs_bitmap, 1 << i))
+			num_cs++;
+	desc->num_cs = num_cs;
+
+	if (pwr_en && pwr) {
+		bd_idx++;
+		desc->data_ptrs[bd_idx].address = rte_pktmbuf_iova_offset(pwr, *pwr_offset);
+		desc->data_ptrs[bd_idx].blen = num_cs * (1 << op->fft.num_antennas_log2) * 4;
+		desc->data_ptrs[bd_idx].blkid = ACC_DMA_BLKID_OUT_SOFT;
+		desc->data_ptrs[bd_idx].last = 1;
+		desc->data_ptrs[bd_idx].dma_ext = 0;
+	}
+	desc->ob_cyc_offset = op->fft.output_sequence_size;
+	desc->ob_ant_offset = op->fft.output_sequence_size * num_cs;
+	desc->op_addr = op;
+	return 0;
+}
 
 /** Enqueue one FFT operation for device. */
 static inline int
@@ -3889,22 +4003,32 @@  vrb_enqueue_fft_one_op(struct acc_queue *q, struct rte_bbdev_fft_op *op,
 		uint16_t total_enqueued_cbs)
 {
 	union acc_dma_desc *desc;
-	struct rte_mbuf *input, *output;
-	uint32_t in_offset, out_offset;
+	struct rte_mbuf *input, *output, *pwr, *win;
+	uint32_t in_offset, out_offset, pwr_offset, win_offset;
 	struct acc_fcw_fft *fcw;
 
 	desc = acc_desc(q, total_enqueued_cbs);
 	input = op->fft.base_input.data;
 	output = op->fft.base_output.data;
+	pwr = op->fft.power_meas_output.data;
+	win = op->fft.dewindowing_input.data;
 	in_offset = op->fft.base_input.offset;
 	out_offset = op->fft.base_output.offset;
+	pwr_offset = op->fft.power_meas_output.offset;
+	win_offset = op->fft.dewindowing_input.offset;
 
 	fcw = (struct acc_fcw_fft *) (q->fcw_ring +
 			((q->sw_ring_head + total_enqueued_cbs) & q->sw_ring_wrap_mask)
 			* ACC_MAX_FCW_SIZE);
 
-	vrb1_fcw_fft_fill(op, fcw);
-	vrb1_dma_desc_fft_fill(op, &desc->req, input, output, &in_offset, &out_offset);
+	if (q->d->device_variant == VRB1_VARIANT) {
+		vrb1_fcw_fft_fill(op, fcw);
+		vrb1_dma_desc_fft_fill(op, &desc->req, input, output, &in_offset, &out_offset);
+	} else {
+		vrb2_fcw_fft_fill(op, (struct acc_fcw_fft_3 *) fcw);
+		vrb2_dma_desc_fft_fill(op, &desc->req, input, output, win, pwr,
+				&in_offset, &out_offset, &win_offset, &pwr_offset);
+	}
 #ifdef RTE_LIBRTE_BBDEV_DEBUG
 	rte_memdump(stderr, "FCW", &desc->req.fcw_fft,
 			sizeof(desc->req.fcw_fft));