[RFC] net/i40e: replace get and put functions

Message ID 20230109145732.7085-1-kamalakshitha.aligeri@arm.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series [RFC] net/i40e: replace get and put functions |

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/loongarch-compilation success Compilation OK
ci/loongarch-unit-testing success Unit Testing PASS
ci/Intel-compilation fail Compilation issues

Commit Message

Kamalakshitha Aligeri Jan. 9, 2023, 2:57 p.m. UTC
  Integrated zero-copy get and put API's in mempool cache in i40e PMD

Signed-off-by: Kamalakshitha Aligeri <kamalakshitha.aligeri@arm.com>
---
1. I have replaced the rte_mempool_get_bulk and rte_mempool_put_bulk in
net/i40e with the zero-copy get and put API's

 drivers/net/i40e/i40e_rxtx_vec_avx512.c | 10 +---------
 drivers/net/i40e/i40e_rxtx_vec_common.h | 21 +++++++++++++--------
 drivers/net/i40e/i40e_rxtx_vec_neon.c   | 16 ++++++++++++----
 3 files changed, 26 insertions(+), 21 deletions(-)
  

Comments

Morten Brørup Jan. 13, 2023, 11:50 a.m. UTC | #1
+CC: i40e maintainers

> From: Kamalakshitha Aligeri [mailto:kamalakshitha.aligeri@arm.com]
> Sent: Monday, 9 January 2023 15.58
> 
> Integrated zero-copy get and put API's in mempool cache in i40e PMD
> 
> Signed-off-by: Kamalakshitha Aligeri <kamalakshitha.aligeri@arm.com>
> ---
> 1. I have replaced the rte_mempool_get_bulk and rte_mempool_put_bulk in
> net/i40e with the zero-copy get and put API's
> 
>  drivers/net/i40e/i40e_rxtx_vec_avx512.c | 10 +---------
>  drivers/net/i40e/i40e_rxtx_vec_common.h | 21 +++++++++++++--------
>  drivers/net/i40e/i40e_rxtx_vec_neon.c   | 16 ++++++++++++----
>  3 files changed, 26 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> index 60c97d5331..736bd4650f 100644
> --- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> +++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c

i40e_rxq_rearm() also accesses the cache directly, and thus needs rewriting to the new mempool cache API.

> @@ -909,7 +909,7 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
>  		if (!cache || cache->len == 0)

This is not your doing, but I don't understand the reason for the cache->len == 0 comparison here. Why not store objects in the cache if it is empty? Maybe an old copy-paste bug?

>  			goto normal;
> 
> -		cache_objs = &cache->objs[cache->len];
> +		cache_objs = rte_mempool_cache_zc_put_bulk(cache, mp, n);
> 
>  		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {

This comparison should be (cache_objs == NULL) instead.

>  			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);

The comment block on lines 919-923 must be deleted too.

> @@ -936,14 +936,6 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue
> *txq)
>  			_mm512_storeu_si512(&cache_objs[copied + 24], d);
>  			copied += 32;
>  		}
> -		cache->len += n;
> -
> -		if (cache->len >= cache->flushthresh) {
> -			rte_mempool_ops_enqueue_bulk
> -				(mp, &cache->objs[cache->size],
> -				cache->len - cache->size);
> -			cache->len = cache->size;
> -		}
>  		goto done;
>  	}
> 
> diff --git a/drivers/net/i40e/i40e_rxtx_vec_common.h
> b/drivers/net/i40e/i40e_rxtx_vec_common.h
> index fe1a6ec75e..4fc4aa0aec 100644
> --- a/drivers/net/i40e/i40e_rxtx_vec_common.h
> +++ b/drivers/net/i40e/i40e_rxtx_vec_common.h
> @@ -89,23 +89,28 @@ i40e_tx_free_bufs(struct i40e_tx_queue *txq)
> 
>  	/* check DD bits on threshold descriptor */
>  	if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
> -			rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
> +				rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
>  			rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE))
>  		return 0;
> 
>  	n = txq->tx_rs_thresh;
> 
> -	 /* first buffer to free from S/W ring is at index
> -	  * tx_next_dd - (tx_rs_thresh-1)
> -	  */
> +	/* first buffer to free from S/W ring is at index
> +	 * tx_next_dd - (tx_rs_thresh-1)
> +	 */
>  	txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)];
> +	struct rte_mempool *mp = txep[0].mbuf->pool;
> +	struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
> rte_lcore_id());
> +	void **cache_objs;
> +
> +	cache_objs = rte_mempool_cache_zc_put_bulk(cache, mp, n);

These belong inside the "if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)" block.

> 
>  	if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
>  		for (i = 0; i < n; i++) {
> -			free[i] = txep[i].mbuf;
> +			rte_memcpy(&cache_objs[i], &txep->mbuf, sizeof(struct
> rte_mbuf));

You must copy pointers to mbufs, not mbuf structures. I.e. instead of rte_memcpy(...) do this:

+			cache_objs[i] = txep->mbuf;


>  			/* no need to reset txep[i].mbuf in vector path */
> +			txep++;
>  		}
> -		rte_mempool_put_bulk(free[0]->pool, (void **)free, n);
>  		goto done;
>  	}
> 
> @@ -120,8 +125,8 @@ i40e_tx_free_bufs(struct i40e_tx_queue *txq)
>  					free[nb_free++] = m;
>  				} else {
>  					rte_mempool_put_bulk(free[0]->pool,
> -							     (void *)free,
> -							     nb_free);
> +							(void *)free,
> +							nb_free);
>  					free[0] = m;
>  					nb_free = 1;
>  				}
> diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c
> b/drivers/net/i40e/i40e_rxtx_vec_neon.c
> index 12e6f1cbcb..ebc2161b84 100644
> --- a/drivers/net/i40e/i40e_rxtx_vec_neon.c
> +++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c


> @@ -28,15 +28,19 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
>  	uint64x2_t dma_addr0, dma_addr1;
>  	uint64x2_t zero = vdupq_n_u64(0);
>  	uint64_t paddr;
> +	uint32_t index, n;

No need for "index"; just reuse "i" instead.

No need for "n"; just use RTE_I40E_RXQ_REARM_THRESH.

> 
> +	n = RTE_I40E_RXQ_REARM_THRESH;
>  	rxdp = rxq->rx_ring + rxq->rxrearm_start;
> +	struct rte_mempool_cache *cache = rte_mempool_default_cache(rxq-
> >mp, rte_lcore_id());
> +	void **cache_objs;

You must add support for mempools without cache: if (cache == NULL) ...

> +
> +	cache_objs = rte_mempool_cache_zc_get_bulk(cache, rxq->mp, n);
> 
>  	/* Pull 'n' more MBUFs into the software ring */
> -	if (unlikely(rte_mempool_get_bulk(rxq->mp,
> -					  (void *)rxep,
> -					  RTE_I40E_RXQ_REARM_THRESH) < 0)) {
> +	if (unlikely(!cache_objs)) {
>  		if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
> -		    rxq->nb_rx_desc) {
> +				rxq->nb_rx_desc) {
>  			for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
>  				rxep[i].mbuf = &rxq->fake_mbuf;
>  				vst1q_u64((uint64_t *)&rxdp[i].read, zero);
> @@ -46,6 +50,10 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
>  			RTE_I40E_RXQ_REARM_THRESH;
>  		return;
>  	}
> +	for (index = 0; index < n; index++) {
> +		rxep->mbuf = cache_objs[index];
> +		rxep++;
> +	}

Please note that struct i40e_rx_entry is essentially the same as struct rte_mbuf [1]. This was taken advantage of in the rte_mempool_get_bulk() above.

[1]: https://elixir.bootlin.com/dpdk/latest/source/drivers/net/i40e/i40e_rxtx.h#L77

It means that the loop that copies the mbuf pointers from the cache_objs[] array to the rxep[] array can be replaced by:
rte_memcpy(rxep, cache_objs, RTE_I40E_RXQ_REARM_THRESH * sizeof(void *));

> 
>  	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
>  	for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) {
> --
> 2.25.1
>
  

Patch

diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
index 60c97d5331..736bd4650f 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -909,7 +909,7 @@  i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
 		if (!cache || cache->len == 0)
 			goto normal;
 
-		cache_objs = &cache->objs[cache->len];
+		cache_objs = rte_mempool_cache_zc_put_bulk(cache, mp, n);
 
 		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
 			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
@@ -936,14 +936,6 @@  i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
 			_mm512_storeu_si512(&cache_objs[copied + 24], d);
 			copied += 32;
 		}
-		cache->len += n;
-
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk
-				(mp, &cache->objs[cache->size],
-				cache->len - cache->size);
-			cache->len = cache->size;
-		}
 		goto done;
 	}
 
diff --git a/drivers/net/i40e/i40e_rxtx_vec_common.h b/drivers/net/i40e/i40e_rxtx_vec_common.h
index fe1a6ec75e..4fc4aa0aec 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_common.h
+++ b/drivers/net/i40e/i40e_rxtx_vec_common.h
@@ -89,23 +89,28 @@  i40e_tx_free_bufs(struct i40e_tx_queue *txq)
 
 	/* check DD bits on threshold descriptor */
 	if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
-			rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
+				rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
 			rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE))
 		return 0;
 
 	n = txq->tx_rs_thresh;
 
-	 /* first buffer to free from S/W ring is at index
-	  * tx_next_dd - (tx_rs_thresh-1)
-	  */
+	/* first buffer to free from S/W ring is at index
+	 * tx_next_dd - (tx_rs_thresh-1)
+	 */
 	txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)];
+	struct rte_mempool *mp = txep[0].mbuf->pool;
+	struct rte_mempool_cache *cache = rte_mempool_default_cache(mp, rte_lcore_id());
+	void **cache_objs;
+
+	cache_objs = rte_mempool_cache_zc_put_bulk(cache, mp, n);
 
 	if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
 		for (i = 0; i < n; i++) {
-			free[i] = txep[i].mbuf;
+			rte_memcpy(&cache_objs[i], &txep->mbuf, sizeof(struct rte_mbuf));
 			/* no need to reset txep[i].mbuf in vector path */
+			txep++;
 		}
-		rte_mempool_put_bulk(free[0]->pool, (void **)free, n);
 		goto done;
 	}
 
@@ -120,8 +125,8 @@  i40e_tx_free_bufs(struct i40e_tx_queue *txq)
 					free[nb_free++] = m;
 				} else {
 					rte_mempool_put_bulk(free[0]->pool,
-							     (void *)free,
-							     nb_free);
+							(void *)free,
+							nb_free);
 					free[0] = m;
 					nb_free = 1;
 				}
diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c b/drivers/net/i40e/i40e_rxtx_vec_neon.c
index 12e6f1cbcb..ebc2161b84 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_neon.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c
@@ -28,15 +28,19 @@  i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 	uint64x2_t dma_addr0, dma_addr1;
 	uint64x2_t zero = vdupq_n_u64(0);
 	uint64_t paddr;
+	uint32_t index, n;
 
+	n = RTE_I40E_RXQ_REARM_THRESH;
 	rxdp = rxq->rx_ring + rxq->rxrearm_start;
+	struct rte_mempool_cache *cache = rte_mempool_default_cache(rxq->mp, rte_lcore_id());
+	void **cache_objs;
+
+	cache_objs = rte_mempool_cache_zc_get_bulk(cache, rxq->mp, n);
 
 	/* Pull 'n' more MBUFs into the software ring */
-	if (unlikely(rte_mempool_get_bulk(rxq->mp,
-					  (void *)rxep,
-					  RTE_I40E_RXQ_REARM_THRESH) < 0)) {
+	if (unlikely(!cache_objs)) {
 		if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
-		    rxq->nb_rx_desc) {
+				rxq->nb_rx_desc) {
 			for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
 				rxep[i].mbuf = &rxq->fake_mbuf;
 				vst1q_u64((uint64_t *)&rxdp[i].read, zero);
@@ -46,6 +50,10 @@  i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 			RTE_I40E_RXQ_REARM_THRESH;
 		return;
 	}
+	for (index = 0; index < n; index++) {
+		rxep->mbuf = cache_objs[index];
+		rxep++;
+	}
 
 	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
 	for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) {