From patchwork Tue Jul 11 10:24:47 2023
X-Patchwork-Submitter: Dongdong Liu
X-Patchwork-Id: 129444
X-Patchwork-Delegate: ferruh.yigit@amd.com
From: Dongdong Liu
To: , , ,
CC:
Subject: [PATCH 4/5] net/hns3: optimize the rearm mbuf function for SVE Rx
Date: Tue, 11 Jul 2023 18:24:47 +0800
Message-ID: <20230711102448.11627-5-liudongdong3@huawei.com>
X-Mailer: git-send-email 2.22.0
In-Reply-To: <20230711102448.11627-1-liudongdong3@huawei.com>
References: <20230711102448.11627-1-liudongdong3@huawei.com>

From: Huisong Li

Use hns3_rxq_rearm_mbuf() to replace hns3_rxq_rearm_mbuf_sve() in order to
optimize SVE Rx performance. In rxonly forwarding mode, single-queue
performance for 64B packets improves by ~15%.
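As context, a minimal sketch of the call-site pattern used in
hns3_recv_pkts_vec_sve() after this change. The identifiers are the
driver's own (see the hunks below); the wrapper function name is made up
purely for illustration:

	/*
	 * Hypothetical wrapper showing the pattern used before the burst and
	 * inside the receive loop: once more than HNS3_DEFAULT_RXQ_REARM_THRESH
	 * BDs have been consumed, refill them through the shared
	 * hns3_rxq_rearm_mbuf() instead of the removed SVE-specific
	 * hns3_rxq_rearm_mbuf_sve().
	 */
	static inline void
	maybe_rearm(struct hns3_rx_queue *rxq)
	{
		if (rxq->rx_rearm_nb > HNS3_DEFAULT_RXQ_REARM_THRESH)
			hns3_rxq_rearm_mbuf(rxq);
	}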
Signed-off-by: Huisong Li 
Signed-off-by: Dongdong Liu 
---
 drivers/net/hns3/hns3_rxtx_vec.c     | 51 ---------------------------
 drivers/net/hns3/hns3_rxtx_vec.h     | 51 +++++++++++++++++++++++++++
 drivers/net/hns3/hns3_rxtx_vec_sve.c | 52 ++--------------------------
 3 files changed, 53 insertions(+), 101 deletions(-)

diff --git a/drivers/net/hns3/hns3_rxtx_vec.c b/drivers/net/hns3/hns3_rxtx_vec.c
index cd9264d91b..9708ec614e 100644
--- a/drivers/net/hns3/hns3_rxtx_vec.c
+++ b/drivers/net/hns3/hns3_rxtx_vec.c
@@ -55,57 +55,6 @@ hns3_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 	return nb_tx;
 }
 
-static inline void
-hns3_rxq_rearm_mbuf(struct hns3_rx_queue *rxq)
-{
-#define REARM_LOOP_STEP_NUM 4
-	struct hns3_entry *rxep = &rxq->sw_ring[rxq->rx_rearm_start];
-	struct hns3_desc *rxdp = rxq->rx_ring + rxq->rx_rearm_start;
-	uint64_t dma_addr;
-	int i;
-
-	if (unlikely(rte_mempool_get_bulk(rxq->mb_pool, (void *)rxep,
-					  HNS3_DEFAULT_RXQ_REARM_THRESH) < 0)) {
-		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
-		return;
-	}
-
-	for (i = 0; i < HNS3_DEFAULT_RXQ_REARM_THRESH; i += REARM_LOOP_STEP_NUM,
-		rxep += REARM_LOOP_STEP_NUM, rxdp += REARM_LOOP_STEP_NUM) {
-		if (likely(i <
-			   HNS3_DEFAULT_RXQ_REARM_THRESH - REARM_LOOP_STEP_NUM)) {
-			rte_prefetch_non_temporal(rxep[4].mbuf);
-			rte_prefetch_non_temporal(rxep[5].mbuf);
-			rte_prefetch_non_temporal(rxep[6].mbuf);
-			rte_prefetch_non_temporal(rxep[7].mbuf);
-		}
-
-		dma_addr = rte_mbuf_data_iova_default(rxep[0].mbuf);
-		rxdp[0].addr = rte_cpu_to_le_64(dma_addr);
-		rxdp[0].rx.bd_base_info = 0;
-
-		dma_addr = rte_mbuf_data_iova_default(rxep[1].mbuf);
-		rxdp[1].addr = rte_cpu_to_le_64(dma_addr);
-		rxdp[1].rx.bd_base_info = 0;
-
-		dma_addr = rte_mbuf_data_iova_default(rxep[2].mbuf);
-		rxdp[2].addr = rte_cpu_to_le_64(dma_addr);
-		rxdp[2].rx.bd_base_info = 0;
-
-		dma_addr = rte_mbuf_data_iova_default(rxep[3].mbuf);
-		rxdp[3].addr = rte_cpu_to_le_64(dma_addr);
-		rxdp[3].rx.bd_base_info = 0;
-	}
-
-	rxq->rx_rearm_start += HNS3_DEFAULT_RXQ_REARM_THRESH;
-	if (rxq->rx_rearm_start >= rxq->nb_rx_desc)
-		rxq->rx_rearm_start = 0;
-
-	rxq->rx_rearm_nb -= HNS3_DEFAULT_RXQ_REARM_THRESH;
-
-	hns3_write_reg_opt(rxq->io_head_reg, HNS3_DEFAULT_RXQ_REARM_THRESH);
-}
-
 uint16_t
 hns3_recv_pkts_vec(void *__restrict rx_queue,
 		   struct rte_mbuf **__restrict rx_pkts,
diff --git a/drivers/net/hns3/hns3_rxtx_vec.h b/drivers/net/hns3/hns3_rxtx_vec.h
index 2c8a91921e..a9a6774294 100644
--- a/drivers/net/hns3/hns3_rxtx_vec.h
+++ b/drivers/net/hns3/hns3_rxtx_vec.h
@@ -94,4 +94,55 @@ hns3_rx_reassemble_pkts(struct rte_mbuf **rx_pkts,
 
 	return count;
 }
+
+static inline void
+hns3_rxq_rearm_mbuf(struct hns3_rx_queue *rxq)
+{
+#define REARM_LOOP_STEP_NUM 4
+	struct hns3_entry *rxep = &rxq->sw_ring[rxq->rx_rearm_start];
+	struct hns3_desc *rxdp = rxq->rx_ring + rxq->rx_rearm_start;
+	uint64_t dma_addr;
+	int i;
+
+	if (unlikely(rte_mempool_get_bulk(rxq->mb_pool, (void *)rxep,
+					  HNS3_DEFAULT_RXQ_REARM_THRESH) < 0)) {
+		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
+		return;
+	}
+
+	for (i = 0; i < HNS3_DEFAULT_RXQ_REARM_THRESH; i += REARM_LOOP_STEP_NUM,
+		rxep += REARM_LOOP_STEP_NUM, rxdp += REARM_LOOP_STEP_NUM) {
+		if (likely(i <
+			   HNS3_DEFAULT_RXQ_REARM_THRESH - REARM_LOOP_STEP_NUM)) {
+			rte_prefetch_non_temporal(rxep[4].mbuf);
+			rte_prefetch_non_temporal(rxep[5].mbuf);
+			rte_prefetch_non_temporal(rxep[6].mbuf);
+			rte_prefetch_non_temporal(rxep[7].mbuf);
+		}
+
+		dma_addr = rte_mbuf_data_iova_default(rxep[0].mbuf);
+		rxdp[0].addr = rte_cpu_to_le_64(dma_addr);
+		rxdp[0].rx.bd_base_info = 0;
+
+		dma_addr = rte_mbuf_data_iova_default(rxep[1].mbuf);
+		rxdp[1].addr = rte_cpu_to_le_64(dma_addr);
+		rxdp[1].rx.bd_base_info = 0;
+
+		dma_addr = rte_mbuf_data_iova_default(rxep[2].mbuf);
+		rxdp[2].addr = rte_cpu_to_le_64(dma_addr);
+		rxdp[2].rx.bd_base_info = 0;
+
+		dma_addr = rte_mbuf_data_iova_default(rxep[3].mbuf);
+		rxdp[3].addr = rte_cpu_to_le_64(dma_addr);
+		rxdp[3].rx.bd_base_info = 0;
+	}
+
+	rxq->rx_rearm_start += HNS3_DEFAULT_RXQ_REARM_THRESH;
+	if (rxq->rx_rearm_start >= rxq->nb_rx_desc)
+		rxq->rx_rearm_start = 0;
+
+	rxq->rx_rearm_nb -= HNS3_DEFAULT_RXQ_REARM_THRESH;
+
+	hns3_write_reg_opt(rxq->io_head_reg, HNS3_DEFAULT_RXQ_REARM_THRESH);
+}
 #endif /* HNS3_RXTX_VEC_H */
diff --git a/drivers/net/hns3/hns3_rxtx_vec_sve.c b/drivers/net/hns3/hns3_rxtx_vec_sve.c
index 5011544e07..54aef7db8d 100644
--- a/drivers/net/hns3/hns3_rxtx_vec_sve.c
+++ b/drivers/net/hns3/hns3_rxtx_vec_sve.c
@@ -237,54 +237,6 @@ hns3_recv_burst_vec_sve(struct hns3_rx_queue *__restrict rxq,
 	return nb_rx;
 }
 
-static inline void
-hns3_rxq_rearm_mbuf_sve(struct hns3_rx_queue *rxq)
-{
-#define REARM_LOOP_STEP_NUM 4
-	struct hns3_entry *rxep = &rxq->sw_ring[rxq->rx_rearm_start];
-	struct hns3_desc *rxdp = rxq->rx_ring + rxq->rx_rearm_start;
-	struct hns3_entry *rxep_tmp = rxep;
-	int i;
-
-	if (unlikely(rte_mempool_get_bulk(rxq->mb_pool, (void *)rxep,
-					  HNS3_DEFAULT_RXQ_REARM_THRESH) < 0)) {
-		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
-		return;
-	}
-
-	for (i = 0; i < HNS3_DEFAULT_RXQ_REARM_THRESH; i += REARM_LOOP_STEP_NUM,
-		rxep_tmp += REARM_LOOP_STEP_NUM) {
-		svuint64_t prf = svld1_u64(PG64_256BIT, (uint64_t *)rxep_tmp);
-		svprfd_gather_u64base(PG64_256BIT, prf, SV_PLDL1STRM);
-	}
-
-	for (i = 0; i < HNS3_DEFAULT_RXQ_REARM_THRESH; i += REARM_LOOP_STEP_NUM,
-		rxep += REARM_LOOP_STEP_NUM, rxdp += REARM_LOOP_STEP_NUM) {
-		uint64_t iova[REARM_LOOP_STEP_NUM];
-		iova[0] = rte_mbuf_iova_get(rxep[0].mbuf);
-		iova[1] = rte_mbuf_iova_get(rxep[1].mbuf);
-		iova[2] = rte_mbuf_iova_get(rxep[2].mbuf);
-		iova[3] = rte_mbuf_iova_get(rxep[3].mbuf);
-		svuint64_t siova = svld1_u64(PG64_256BIT, iova);
-		siova = svadd_n_u64_z(PG64_256BIT, siova, RTE_PKTMBUF_HEADROOM);
-		svuint64_t ol_base = svdup_n_u64(0);
-		svst1_scatter_u64offset_u64(PG64_256BIT,
-				(uint64_t *)&rxdp[0].addr,
-				svindex_u64(BD_FIELD_ADDR_OFFSET, BD_SIZE), siova);
-		svst1_scatter_u64offset_u64(PG64_256BIT,
-				(uint64_t *)&rxdp[0].addr,
-				svindex_u64(BD_FIELD_OL_OFFSET, BD_SIZE), ol_base);
-	}
-
-	rxq->rx_rearm_start += HNS3_DEFAULT_RXQ_REARM_THRESH;
-	if (rxq->rx_rearm_start >= rxq->nb_rx_desc)
-		rxq->rx_rearm_start = 0;
-
-	rxq->rx_rearm_nb -= HNS3_DEFAULT_RXQ_REARM_THRESH;
-
-	hns3_write_reg_opt(rxq->io_head_reg, HNS3_DEFAULT_RXQ_REARM_THRESH);
-}
-
 uint16_t
 hns3_recv_pkts_vec_sve(void *__restrict rx_queue,
 		       struct rte_mbuf **__restrict rx_pkts,
@@ -300,7 +252,7 @@ hns3_recv_pkts_vec_sve(void *__restrict rx_queue,
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, HNS3_SVE_DEFAULT_DESCS_PER_LOOP);
 
 	if (rxq->rx_rearm_nb > HNS3_DEFAULT_RXQ_REARM_THRESH)
-		hns3_rxq_rearm_mbuf_sve(rxq);
+		hns3_rxq_rearm_mbuf(rxq);
 
 	if (unlikely(!(rxdp->rx.bd_base_info &
 			rte_cpu_to_le_32(1u << HNS3_RXD_VLD_B))))
@@ -331,7 +283,7 @@ hns3_recv_pkts_vec_sve(void *__restrict rx_queue,
 			break;
 
 		if (rxq->rx_rearm_nb > HNS3_DEFAULT_RXQ_REARM_THRESH)
-			hns3_rxq_rearm_mbuf_sve(rxq);
+			hns3_rxq_rearm_mbuf(rxq);
 	}
 
 	return nb_rx;
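
For reviewers skimming the diff, a condensed, commented paraphrase of the
rearm pass that the shared hns3_rxq_rearm_mbuf() (added to hns3_rxtx_vec.h
above) performs. This is a simplified sketch, not a drop-in replacement:
the 4-descriptor loop unrolling, the non-temporal prefetch of upcoming
sw_ring entries, and the alloc-failure counter are omitted, and the
function name is invented for illustration.

	/* Sketch of one rearm pass, simplified from the helper above:
	 * 1. Bulk-allocate HNS3_DEFAULT_RXQ_REARM_THRESH mbufs into the sw_ring
	 *    slice starting at rx_rearm_start; give up if the mempool is short.
	 * 2. For each new mbuf, write its default data IOVA into the BD's addr
	 *    field (little-endian) and clear bd_base_info so the BD reads as
	 *    not-yet-done.
	 * 3. Advance rx_rearm_start (wrapping at nb_rx_desc), shrink
	 *    rx_rearm_nb, and write the count to io_head_reg to hand the BDs
	 *    back to hardware.
	 */
	static inline void
	rearm_pass_sketch(struct hns3_rx_queue *rxq)
	{
		struct hns3_entry *rxep = &rxq->sw_ring[rxq->rx_rearm_start];
		struct hns3_desc *rxdp = rxq->rx_ring + rxq->rx_rearm_start;
		uint16_t i;

		if (rte_mempool_get_bulk(rxq->mb_pool, (void *)rxep,
					 HNS3_DEFAULT_RXQ_REARM_THRESH) < 0)
			return; /* the real helper also bumps rx_mbuf_alloc_failed */

		for (i = 0; i < HNS3_DEFAULT_RXQ_REARM_THRESH; i++) {
			rxdp[i].addr = rte_cpu_to_le_64(
					rte_mbuf_data_iova_default(rxep[i].mbuf));
			rxdp[i].rx.bd_base_info = 0;
		}

		rxq->rx_rearm_start += HNS3_DEFAULT_RXQ_REARM_THRESH;
		if (rxq->rx_rearm_start >= rxq->nb_rx_desc)
			rxq->rx_rearm_start = 0;
		rxq->rx_rearm_nb -= HNS3_DEFAULT_RXQ_REARM_THRESH;

		hns3_write_reg_opt(rxq->io_head_reg, HNS3_DEFAULT_RXQ_REARM_THRESH);
	}

Compared with the removed hns3_rxq_rearm_mbuf_sve(), this scalar path drops
the SVE gather/scatter intrinsics while doing the same per-BD work, which
is where the reported ~15% single-queue gain comes from.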