@@ -59,6 +59,8 @@ New Features
* Optimize mbuf rearm sequence.
* Updated Tx queue mbuf free thresholds from 128 to 256 for better performance.
+ * Updated Rx queue mbuf refill routine to use mempool alloc and reorder it
+ to avoid mbuf write commits.
* Added optimized SSE Rx routines.
* Added optimized AVX2 Rx routines.
@@ -76,12 +76,12 @@ cnxk_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
uint16_t new_pkts;
new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
- cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
-
/* Refill RX buffers */
if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
cnxk_ep_rx_refill(droq);
+ cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
+
return new_pkts;
}
@@ -21,13 +21,16 @@ cnxk_ep_rx_refill_mbuf(struct otx_ep_droq *droq, uint32_t count)
uint32_t i;
int rc;
- rc = rte_pktmbuf_alloc_bulk(droq->mpool, &recv_buf_list[refill_idx], count);
+ rc = rte_mempool_get_bulk(droq->mpool, (void **)&recv_buf_list[refill_idx], count);
if (unlikely(rc)) {
droq->stats.rx_alloc_failure++;
return rc;
}
for (i = 0; i < count; i++) {
+ rte_prefetch_non_temporal(&desc_ring[(refill_idx + 1) & 3]);
+ if (i < count - 1)
+ rte_prefetch_non_temporal(recv_buf_list[refill_idx + 1]);
buf = recv_buf_list[refill_idx];
desc_ring[refill_idx].buffer_ptr = rte_mbuf_data_iova_default(buf);
refill_idx++;
@@ -42,9 +45,9 @@ cnxk_ep_rx_refill_mbuf(struct otx_ep_droq *droq, uint32_t count)
static inline void
cnxk_ep_rx_refill(struct otx_ep_droq *droq)
{
- uint32_t desc_refilled = 0, count;
- uint32_t nb_desc = droq->nb_desc;
+ const uint32_t nb_desc = droq->nb_desc;
uint32_t refill_idx = droq->refill_idx;
+ uint32_t desc_refilled = 0, count;
int rc;
if (unlikely(droq->read_idx == refill_idx))
@@ -128,6 +131,8 @@ cnxk_ep_rx_pkts_to_process(struct otx_ep_droq *droq, uint16_t nb_pkts)
return RTE_MIN(nb_pkts, droq->pkts_pending);
}
+#define cnxk_pktmbuf_mtod(m, t) ((t)(void *)((char *)(m)->buf_addr + RTE_PKTMBUF_HEADROOM))
+
static __rte_always_inline void
cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
{
@@ -147,7 +152,7 @@ cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq,
void *));
mbuf = recv_buf_list[read_idx];
- info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
+ info = cnxk_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
read_idx = otx_ep_incr_index(read_idx, 1, nb_desc);
pkt_len = rte_bswap16(info->length >> 48);
mbuf->pkt_len = pkt_len;
@@ -49,7 +49,7 @@ cnxk_ep_process_pkts_vec_avx(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq
/* Load rearm data and packet length for shuffle. */
for (i = 0; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++)
data[i] = _mm256_set_epi64x(0,
- rte_pktmbuf_mtod(m[i], struct otx_ep_droq_info *)->length >> 16,
+ cnxk_pktmbuf_mtod(m[i], struct otx_ep_droq_info *)->length >> 16,
0, rearm_data);
/* Shuffle data to its place and sum the packet length. */
@@ -81,15 +81,15 @@ cnxk_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
uint16_t new_pkts, vpkts;
+ /* Refill RX buffers */
+ if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+ cnxk_ep_rx_refill(droq);
+
new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
- /* Refill RX buffers */
- if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
- cnxk_ep_rx_refill(droq);
-
return new_pkts;
}
@@ -99,11 +99,6 @@ cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
uint16_t new_pkts, vpkts;
- new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
- vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
- cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
- cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
-
/* Refill RX buffers */
if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
cnxk_ep_rx_refill(droq);
@@ -119,5 +114,10 @@ cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
rte_write32(0, droq->pkts_credit_reg);
}
+ new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+ vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
+ cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
+ cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
return new_pkts;
}
@@ -18,13 +18,15 @@ static __rte_always_inline void
cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
{
struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
- uint32_t bytes_rsvd = 0, read_idx = droq->read_idx;
- uint32_t idx0, idx1, idx2, idx3;
+ uint32_t read_idx = droq->read_idx;
struct rte_mbuf *m0, *m1, *m2, *m3;
uint16_t nb_desc = droq->nb_desc;
+ uint32_t idx0, idx1, idx2, idx3;
uint16_t pkts = 0;
+ __m128i bytes;
idx0 = read_idx;
+ bytes = _mm_setzero_si128();
while (pkts < new_pkts) {
const __m128i bswap_mask = _mm_set_epi8(0xFF, 0xFF, 12, 13, 0xFF, 0xFF, 8, 9, 0xFF,
0xFF, 4, 5, 0xFF, 0xFF, 0, 1);
@@ -42,14 +44,14 @@ cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq
m3 = recv_buf_list[idx3];
/* Load packet size big-endian. */
- s01 = _mm_set_epi32(rte_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
- rte_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
- rte_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
- rte_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48);
+ s01 = _mm_set_epi32(cnxk_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
+ cnxk_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
+ cnxk_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
+ cnxk_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48);
/* Convert to little-endian. */
s01 = _mm_shuffle_epi8(s01, bswap_mask);
- /* Horizontal add. */
- bytes_rsvd += hadd(s01);
+ /* Vertical add, consolidate outside loop */
+ bytes = _mm_add_epi32(bytes, s01);
/* Segregate to packet length and data length. */
s23 = _mm_shuffle_epi32(s01, _MM_SHUFFLE(3, 3, 1, 1));
s01 = _mm_shuffle_epi8(s01, cpy_mask);
@@ -79,7 +81,7 @@ cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq
droq->pkts_pending -= new_pkts;
/* Stats */
droq->stats.pkts_received += new_pkts;
- droq->stats.bytes_received += bytes_rsvd;
+ droq->stats.bytes_received += hadd(bytes);
}
uint16_t __rte_noinline __rte_hot
@@ -88,15 +90,15 @@ cnxk_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
uint16_t new_pkts, vpkts;
+ /* Refill RX buffers */
+ if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+ cnxk_ep_rx_refill(droq);
+
new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
- /* Refill RX buffers */
- if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
- cnxk_ep_rx_refill(droq);
-
return new_pkts;
}
@@ -106,11 +108,6 @@ cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
uint16_t new_pkts, vpkts;
- new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
- vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
- cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
- cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
-
/* Refill RX buffers */
if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
cnxk_ep_rx_refill(droq);
@@ -126,5 +123,10 @@ cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
rte_write32(0, droq->pkts_credit_reg);
}
+ new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+ vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
+ cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
+ cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
return new_pkts;
}
@@ -21,7 +21,7 @@
/* SDP_LENGTH_S specifies packet length and is of 8-byte size */
#define OTX_EP_INFO_SIZE 8
-#define DROQ_REFILL_THRESHOLD 16
+#define DROQ_REFILL_THRESHOLD 64
#define OTX2_SDP_REQUEST_ISM (0x1ULL << 63)
static inline uint32_t