From patchwork Tue Jul 11 10:24:48 2023
X-Patchwork-Submitter: Dongdong Liu
X-Patchwork-Id: 129446
X-Patchwork-Delegate: ferruh.yigit@amd.com
From: Dongdong Liu
Subject: [PATCH 5/5] net/hns3: optimize SVE Rx performance
Date: Tue, 11 Jul 2023 18:24:48 +0800
Message-ID: <20230711102448.11627-6-liudongdong3@huawei.com>
X-Mailer: git-send-email 2.22.0
In-Reply-To: <20230711102448.11627-1-liudongdong3@huawei.com>
References: <20230711102448.11627-1-liudongdong3@huawei.com>
List-Id: DPDK patches and discussions

From: Huisong Li

This patch optimizes SVE Rx performance in the following ways:
1> optimize the calculation of the valid BD number.
2> remove a temporary variable (key_field).
3> use C code instead of SVE instructions to parse some descriptor fields.
4> prefetch descriptors in smaller steps.

In rxonly forwarding mode, the performance of a single queue with 64B
packets is improved by ~40%.
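A note for reviewers (not part of the applied patch): the reworked "calc how
many bd valid" step replaces the old shift/TBL/CLZ sequence with a
predicate-based count. Below is a minimal standalone sketch of that idea in C
using ACLE SVE intrinsics; DESC_STRIDE, VLD_WORD_OFFSET, VLD_BIT and
count_leading_valid() are illustrative stand-ins for the driver's BD_SIZE,
BD_FIELD_VALID_OFFSET and BIT(HNS3_RXD_VLD_B), and it assumes a build with
SVE enabled (e.g. -march=armv8.2-a+sve).

#include <arm_sve.h>
#include <stdint.h>

#define DESC_STRIDE      32u        /* bytes per descriptor (stand-in for BD_SIZE) */
#define VLD_WORD_OFFSET  28u        /* byte offset of the 32-bit word holding VLD  */
#define VLD_BIT          (1u << 8)  /* stand-in position of the VLD bit            */

/* Count how many leading descriptors in a burst of 'nb' have VLD set. */
static inline uint64_t
count_leading_valid(const uint32_t *desc_base, uint32_t nb)
{
	svbool_t pg = svwhilelt_b32_u32(0, nb);
	/* gather the 32-bit "valid" word of each descriptor */
	svuint32_t vld = svld1_gather_u32offset_u32(pg, desc_base,
			svindex_u32(VLD_WORD_OFFSET, DESC_STRIDE));
	/* isolate the VLD bit, then flag the lanes where it is NOT set */
	vld = svand_n_u32_z(pg, vld, VLD_BIT);
	svbool_t invalid = svcmpne_n_u32(pg, vld, VLD_BIT);
	/* break before the first invalid lane and count the survivors */
	return svcntp_b32(pg, svbrkb_b_z(pg, invalid));
}

Compared with the removed part 1/part 2 sequence, the count is produced
directly in a scalar register by svcntp, with no store of the lane mask to
memory, and the early break when no descriptor is valid lets the burst loop
skip the mbuf pointer loads entirely.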
Signed-off-by: Huisong Li
Signed-off-by: Dongdong Liu
---
 drivers/net/hns3/hns3_rxtx_vec_sve.c | 138 ++++++---------------------
 1 file changed, 28 insertions(+), 110 deletions(-)

diff --git a/drivers/net/hns3/hns3_rxtx_vec_sve.c b/drivers/net/hns3/hns3_rxtx_vec_sve.c
index 54aef7db8d..0e9abfebec 100644
--- a/drivers/net/hns3/hns3_rxtx_vec_sve.c
+++ b/drivers/net/hns3/hns3_rxtx_vec_sve.c
@@ -20,40 +20,36 @@
 
 #define BD_SIZE 32
 #define BD_FIELD_ADDR_OFFSET 0
-#define BD_FIELD_L234_OFFSET 8
-#define BD_FIELD_XLEN_OFFSET 12
-#define BD_FIELD_RSS_OFFSET 16
-#define BD_FIELD_OL_OFFSET 24
 #define BD_FIELD_VALID_OFFSET 28
 
-typedef struct {
-	uint32_t l234_info[HNS3_SVE_DEFAULT_DESCS_PER_LOOP];
-	uint32_t ol_info[HNS3_SVE_DEFAULT_DESCS_PER_LOOP];
-	uint32_t bd_base_info[HNS3_SVE_DEFAULT_DESCS_PER_LOOP];
-} HNS3_SVE_KEY_FIELD_S;
-
 static inline uint32_t
 hns3_desc_parse_field_sve(struct hns3_rx_queue *rxq,
 			  struct rte_mbuf **rx_pkts,
-			  HNS3_SVE_KEY_FIELD_S *key,
+			  struct hns3_desc *rxdp,
 			  uint32_t bd_vld_num)
 {
+	uint32_t l234_info, ol_info, bd_base_info;
 	uint32_t retcode = 0;
 	int ret, i;
 
 	for (i = 0; i < (int)bd_vld_num; i++) {
 		/* init rte_mbuf.rearm_data last 64-bit */
 		rx_pkts[i]->ol_flags = RTE_MBUF_F_RX_RSS_HASH;
-
-		ret = hns3_handle_bdinfo(rxq, rx_pkts[i], key->bd_base_info[i],
-					 key->l234_info[i]);
+		rx_pkts[i]->hash.rss = rxdp[i].rx.rss_hash;
+		rx_pkts[i]->pkt_len = rte_le_to_cpu_16(rxdp[i].rx.pkt_len) -
+				      rxq->crc_len;
+		rx_pkts[i]->data_len = rx_pkts[i]->pkt_len;
+
+		l234_info = rxdp[i].rx.l234_info;
+		ol_info = rxdp[i].rx.ol_info;
+		bd_base_info = rxdp[i].rx.bd_base_info;
+		ret = hns3_handle_bdinfo(rxq, rx_pkts[i], bd_base_info, l234_info);
 		if (unlikely(ret)) {
 			retcode |= 1u << i;
 			continue;
 		}
 
-		rx_pkts[i]->packet_type = hns3_rx_calc_ptype(rxq,
-					key->l234_info[i], key->ol_info[i]);
+		rx_pkts[i]->packet_type = hns3_rx_calc_ptype(rxq, l234_info, ol_info);
 
 		/* Increment bytes counter */
 		rxq->basic_stats.bytes += rx_pkts[i]->pkt_len;
@@ -77,46 +73,16 @@ hns3_recv_burst_vec_sve(struct hns3_rx_queue *__restrict rxq,
 			uint16_t nb_pkts,
 			uint64_t *bd_err_mask)
 {
-#define XLEN_ADJUST_LEN		32
-#define RSS_ADJUST_LEN		16
-#define GEN_VLD_U8_ZIP_INDEX	svindex_s8(28, -4)
 	uint16_t rx_id = rxq->next_to_use;
 	struct hns3_entry *sw_ring = &rxq->sw_ring[rx_id];
 	struct hns3_desc *rxdp = &rxq->rx_ring[rx_id];
-	struct hns3_desc *rxdp2;
-	HNS3_SVE_KEY_FIELD_S key_field;
+	struct hns3_desc *rxdp2, *next_rxdp;
 	uint64_t bd_valid_num;
 	uint32_t parse_retcode;
 	uint16_t nb_rx = 0;
 	int pos, offset;
 
-	uint16_t xlen_adjust[XLEN_ADJUST_LEN] = {
-		0, 0xffff, 1, 0xffff,	/* 1st mbuf: pkt_len and dat_len */
-		2, 0xffff, 3, 0xffff,	/* 2st mbuf: pkt_len and dat_len */
-		4, 0xffff, 5, 0xffff,	/* 3st mbuf: pkt_len and dat_len */
-		6, 0xffff, 7, 0xffff,	/* 4st mbuf: pkt_len and dat_len */
-		8, 0xffff, 9, 0xffff,	/* 5st mbuf: pkt_len and dat_len */
-		10, 0xffff, 11, 0xffff,	/* 6st mbuf: pkt_len and dat_len */
-		12, 0xffff, 13, 0xffff,	/* 7st mbuf: pkt_len and dat_len */
-		14, 0xffff, 15, 0xffff,	/* 8st mbuf: pkt_len and dat_len */
-	};
-
-	uint32_t rss_adjust[RSS_ADJUST_LEN] = {
-		0, 0xffff,	/* 1st mbuf: rss */
-		1, 0xffff,	/* 2st mbuf: rss */
-		2, 0xffff,	/* 3st mbuf: rss */
-		3, 0xffff,	/* 4st mbuf: rss */
-		4, 0xffff,	/* 5st mbuf: rss */
-		5, 0xffff,	/* 6st mbuf: rss */
-		6, 0xffff,	/* 7st mbuf: rss */
-		7, 0xffff,	/* 8st mbuf: rss */
-	};
-
 	svbool_t pg32 = svwhilelt_b32(0, HNS3_SVE_DEFAULT_DESCS_PER_LOOP);
-	svuint16_t xlen_tbl1 = svld1_u16(PG16_256BIT, xlen_adjust);
-	svuint16_t xlen_tbl2 = svld1_u16(PG16_256BIT, &xlen_adjust[16]);
-	svuint32_t rss_tbl1 = svld1_u32(PG32_256BIT, rss_adjust);
-	svuint32_t rss_tbl2 = svld1_u32(PG32_256BIT, &rss_adjust[8]);
 
 	/* compile-time verifies the xlen_adjust mask */
 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
@@ -126,30 +92,21 @@ hns3_recv_burst_vec_sve(struct hns3_rx_queue *__restrict rxq,
 
 	for (pos = 0; pos < nb_pkts; pos += HNS3_SVE_DEFAULT_DESCS_PER_LOOP,
 				     rxdp += HNS3_SVE_DEFAULT_DESCS_PER_LOOP) {
-		svuint64_t vld_clz, mbp1st, mbp2st, mbuf_init;
-		svuint64_t xlen1st, xlen2st, rss1st, rss2st;
-		svuint32_t l234, ol, vld, vld2, xlen, rss;
-		svuint8_t vld_u8;
+		svuint64_t mbp1st, mbp2st, mbuf_init;
+		svuint32_t vld;
+		svbool_t vld_op;
 
 		/* calc how many bd valid: part 1 */
 		vld = svld1_gather_u32offset_u32(pg32, (uint32_t *)rxdp,
			svindex_u32(BD_FIELD_VALID_OFFSET, BD_SIZE));
-		vld2 = svlsl_n_u32_z(pg32, vld,
-				HNS3_UINT32_BIT - 1 - HNS3_RXD_VLD_B);
-		vld2 = svreinterpret_u32_s32(svasr_n_s32_z(pg32,
-				svreinterpret_s32_u32(vld2), HNS3_UINT32_BIT - 1));
+		vld = svand_n_u32_z(pg32, vld, BIT(HNS3_RXD_VLD_B));
+		vld_op = svcmpne_n_u32(pg32, vld, BIT(HNS3_RXD_VLD_B));
+		bd_valid_num = svcntp_b32(pg32, svbrkb_b_z(pg32, vld_op));
+		if (bd_valid_num == 0)
+			break;
 
 		/* load 4 mbuf pointer */
 		mbp1st = svld1_u64(PG64_256BIT, (uint64_t *)&sw_ring[pos]);
-
-		/* calc how many bd valid: part 2 */
-		vld_u8 = svtbl_u8(svreinterpret_u8_u32(vld2),
-				svreinterpret_u8_s8(GEN_VLD_U8_ZIP_INDEX));
-		vld_clz = svnot_u64_z(PG64_64BIT, svreinterpret_u64_u8(vld_u8));
-		vld_clz = svclz_u64_z(PG64_64BIT, vld_clz);
-		svst1_u64(PG64_64BIT, &bd_valid_num, vld_clz);
-		bd_valid_num /= HNS3_UINT8_BIT;
-
 		/* load 4 more mbuf pointer */
 		mbp2st = svld1_u64(PG64_256BIT, (uint64_t *)&sw_ring[pos + 4]);
 
@@ -159,65 +116,25 @@ hns3_recv_burst_vec_sve(struct hns3_rx_queue *__restrict rxq,
 
 		/* store 4 mbuf pointer into rx_pkts */
 		svst1_u64(PG64_256BIT, (uint64_t *)&rx_pkts[pos], mbp1st);
-
-		/* load key field to vector reg */
-		l234 = svld1_gather_u32offset_u32(pg32, (uint32_t *)rxdp2,
-				svindex_u32(BD_FIELD_L234_OFFSET, BD_SIZE));
-		ol = svld1_gather_u32offset_u32(pg32, (uint32_t *)rxdp2,
-				svindex_u32(BD_FIELD_OL_OFFSET, BD_SIZE));
-
 		/* store 4 mbuf pointer into rx_pkts again */
 		svst1_u64(PG64_256BIT, (uint64_t *)&rx_pkts[pos + 4], mbp2st);
 
-		/* load datalen, pktlen and rss_hash */
-		xlen = svld1_gather_u32offset_u32(pg32, (uint32_t *)rxdp2,
-				svindex_u32(BD_FIELD_XLEN_OFFSET, BD_SIZE));
-		rss = svld1_gather_u32offset_u32(pg32, (uint32_t *)rxdp2,
-				svindex_u32(BD_FIELD_RSS_OFFSET, BD_SIZE));
-
-		/* store key field to stash buffer */
-		svst1_u32(pg32, (uint32_t *)key_field.l234_info, l234);
-		svst1_u32(pg32, (uint32_t *)key_field.bd_base_info, vld);
-		svst1_u32(pg32, (uint32_t *)key_field.ol_info, ol);
-
-		/* sub crc_len for pkt_len and data_len */
-		xlen = svreinterpret_u32_u16(svsub_n_u16_z(PG16_256BIT,
-				svreinterpret_u16_u32(xlen), rxq->crc_len));
-
 		/* init mbuf_initializer */
 		mbuf_init = svdup_n_u64(rxq->mbuf_initializer);
-
-		/* extract datalen, pktlen and rss from xlen and rss */
-		xlen1st = svreinterpret_u64_u16(
-			svtbl_u16(svreinterpret_u16_u32(xlen), xlen_tbl1));
-		xlen2st = svreinterpret_u64_u16(
-			svtbl_u16(svreinterpret_u16_u32(xlen), xlen_tbl2));
-		rss1st = svreinterpret_u64_u32(
-			svtbl_u32(svreinterpret_u32_u32(rss), rss_tbl1));
-		rss2st = svreinterpret_u64_u32(
-			svtbl_u32(svreinterpret_u32_u32(rss), rss_tbl2));
-
 		/* save mbuf_initializer */
 		svst1_scatter_u64base_offset_u64(PG64_256BIT, mbp1st,
			offsetof(struct rte_mbuf, rearm_data), mbuf_init);
 		svst1_scatter_u64base_offset_u64(PG64_256BIT, mbp2st,
			offsetof(struct rte_mbuf, rearm_data), mbuf_init);
 
-		/* save datalen and pktlen and rss */
-		svst1_scatter_u64base_offset_u64(PG64_256BIT, mbp1st,
-			offsetof(struct rte_mbuf, pkt_len), xlen1st);
-		svst1_scatter_u64base_offset_u64(PG64_256BIT, mbp1st,
-			offsetof(struct rte_mbuf, hash.rss), rss1st);
-		svst1_scatter_u64base_offset_u64(PG64_256BIT, mbp2st,
-			offsetof(struct rte_mbuf, pkt_len), xlen2st);
-		svst1_scatter_u64base_offset_u64(PG64_256BIT, mbp2st,
-			offsetof(struct rte_mbuf, hash.rss), rss2st);
-
-		rte_prefetch_non_temporal(rxdp +
-				HNS3_SVE_DEFAULT_DESCS_PER_LOOP);
+		next_rxdp = rxdp + HNS3_SVE_DEFAULT_DESCS_PER_LOOP;
+		rte_prefetch_non_temporal(next_rxdp);
+		rte_prefetch_non_temporal(next_rxdp + 2);
+		rte_prefetch_non_temporal(next_rxdp + 4);
+		rte_prefetch_non_temporal(next_rxdp + 6);
 
 		parse_retcode = hns3_desc_parse_field_sve(rxq, &rx_pkts[pos],
-					&key_field, bd_valid_num);
+					&rxdp2[offset], bd_valid_num);
 		if (unlikely(parse_retcode))
 			(*bd_err_mask) |= ((uint64_t)parse_retcode) << pos;
 
@@ -237,6 +154,7 @@ hns3_recv_burst_vec_sve(struct hns3_rx_queue *__restrict rxq,
 
 	return nb_rx;
 }
+
 uint16_t
 hns3_recv_pkts_vec_sve(void *__restrict rx_queue,
 		       struct rte_mbuf **__restrict rx_pkts,