[v2,1/2] net/memif: add a Rx fast path
Checks
Commit Message
For memif non-zero-copy mode, there is a branch to compare
the mbuf and memif buffer size during memory copying. Add
a fast memory copy path by removing this branch with mbuf
and memif buffer size defined at compile time. The removal
of the branch leads to considerable performance uplift.
The Rx fast path would not change mbuf's behavior of storing
memif buf.
When memif buffer size <= mbuf size, Rx chooses the fast memcpy path,
otherwise it would choose the original path.
Test with 1p1q on Ampere Altra AArch64 server,
----------------------------------------------
| buf size | memif <= mbuf | memif > mbuf |
----------------------------------------------
| non-zc gain | 4.30% | -0.52% |
----------------------------------------------
| zc gain | 2.46% | 0.70% |
----------------------------------------------
Test with 1p1q on Cascade Lake Xeon x86 server,
----------------------------------------------
| buf size | memif <= mbuf | memif > mbuf |
----------------------------------------------
| non-zc gain | 2.13% | -1.40% |
----------------------------------------------
| zc gain | 0.18% | 0.48% |
----------------------------------------------
Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
---
drivers/net/memif/rte_eth_memif.c | 123 ++++++++++++++++++++----------
1 file changed, 83 insertions(+), 40 deletions(-)
Comments
On Fri, 1 Jul 2022 10:28:14 +0000
Joyce Kong <joyce.kong@arm.com> wrote:
> n_slots = last_slot - cur_slot;
> + if (likely(mbuf_size >= pmd->cfg.pkt_buffer_size)) {
> + while (n_slots && n_rx_pkts < nb_pkts) {
> + mbuf_head = rte_pktmbuf_alloc(mq->mempool);
> + if (unlikely(mbuf_head == NULL))
> + goto no_free_bufs;
> + mbuf = mbuf_head;
> +
> +next_slot1:
> + mbuf->port = mq->in_port;
> + s0 = cur_slot & mask;
> + d0 = &ring->desc[s0];
>
You might get additional speedup by doing bulk allocation.
If you know you are going to get N packets, then rte_pktmbuf_alloc_bulk()
might speed it up?
@@ -341,67 +341,111 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
if (cur_slot == last_slot)
goto refill;
n_slots = last_slot - cur_slot;
+ if (likely(mbuf_size >= pmd->cfg.pkt_buffer_size)) {
+ while (n_slots && n_rx_pkts < nb_pkts) {
+ mbuf_head = rte_pktmbuf_alloc(mq->mempool);
+ if (unlikely(mbuf_head == NULL))
+ goto no_free_bufs;
+ mbuf = mbuf_head;
+
+next_slot1:
+ mbuf->port = mq->in_port;
+ s0 = cur_slot & mask;
+ d0 = &ring->desc[s0];
- while (n_slots && n_rx_pkts < nb_pkts) {
- mbuf_head = rte_pktmbuf_alloc(mq->mempool);
- if (unlikely(mbuf_head == NULL))
- goto no_free_bufs;
- mbuf = mbuf_head;
- mbuf->port = mq->in_port;
- dst_off = 0;
+ cp_len = d0->length;
-next_slot:
- s0 = cur_slot & mask;
- d0 = &ring->desc[s0];
+ rte_pktmbuf_data_len(mbuf) = cp_len;
+ rte_pktmbuf_pkt_len(mbuf) = cp_len;
+ if (mbuf != mbuf_head)
+ rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
- src_len = d0->length;
- src_off = 0;
+ rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
+ (uint8_t *)memif_get_buffer(proc_private, d0), cp_len);
- do {
- dst_len = mbuf_size - dst_off;
- if (dst_len == 0) {
- dst_off = 0;
- dst_len = mbuf_size;
+ cur_slot++;
+ n_slots--;
- /* store pointer to tail */
+ if (d0->flags & MEMIF_DESC_FLAG_NEXT) {
mbuf_tail = mbuf;
mbuf = rte_pktmbuf_alloc(mq->mempool);
if (unlikely(mbuf == NULL))
goto no_free_bufs;
- mbuf->port = mq->in_port;
ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
if (unlikely(ret < 0)) {
MIF_LOG(ERR, "number-of-segments-overflow");
rte_pktmbuf_free(mbuf);
goto no_free_bufs;
}
+ goto next_slot1;
}
- cp_len = RTE_MIN(dst_len, src_len);
- rte_pktmbuf_data_len(mbuf) += cp_len;
- rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf);
- if (mbuf != mbuf_head)
- rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
+ mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
+ *bufs++ = mbuf_head;
+ n_rx_pkts++;
+ }
+ } else {
+ while (n_slots && n_rx_pkts < nb_pkts) {
+ mbuf_head = rte_pktmbuf_alloc(mq->mempool);
+ if (unlikely(mbuf_head == NULL))
+ goto no_free_bufs;
+ mbuf = mbuf_head;
+ mbuf->port = mq->in_port;
+
+next_slot2:
+ s0 = cur_slot & mask;
+ d0 = &ring->desc[s0];
- rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *,
- dst_off),
- (uint8_t *)memif_get_buffer(proc_private, d0) +
- src_off, cp_len);
+ src_len = d0->length;
+ dst_off = 0;
+ src_off = 0;
- src_off += cp_len;
- dst_off += cp_len;
- src_len -= cp_len;
- } while (src_len);
+ do {
+ dst_len = mbuf_size - dst_off;
+ if (dst_len == 0) {
+ dst_off = 0;
+ dst_len = mbuf_size;
+
+ /* store pointer to tail */
+ mbuf_tail = mbuf;
+ mbuf = rte_pktmbuf_alloc(mq->mempool);
+ if (unlikely(mbuf == NULL))
+ goto no_free_bufs;
+ mbuf->port = mq->in_port;
+ ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
+ if (unlikely(ret < 0)) {
+ MIF_LOG(ERR, "number-of-segments-overflow");
+ rte_pktmbuf_free(mbuf);
+ goto no_free_bufs;
+ }
+ }
+ cp_len = RTE_MIN(dst_len, src_len);
- cur_slot++;
- n_slots--;
+ rte_pktmbuf_data_len(mbuf) += cp_len;
+ rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf);
+ if (mbuf != mbuf_head)
+ rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
- if (d0->flags & MEMIF_DESC_FLAG_NEXT)
- goto next_slot;
+ rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *,
+ dst_off),
+ (uint8_t *)memif_get_buffer(proc_private, d0) +
+ src_off, cp_len);
- mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
- *bufs++ = mbuf_head;
- n_rx_pkts++;
+ src_off += cp_len;
+ dst_off += cp_len;
+ src_len -= cp_len;
+ } while (src_len);
+
+ cur_slot++;
+ n_slots--;
+
+ if (d0->flags & MEMIF_DESC_FLAG_NEXT)
+ goto next_slot2;
+
+ mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
+ *bufs++ = mbuf_head;
+ n_rx_pkts++;
+ }
}
no_free_bufs:
@@ -694,7 +738,6 @@ eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
return n_tx_pkts;
}
-
static int
memif_tx_one_zc(struct pmd_process_private *proc_private, struct memif_queue *mq,
memif_ring_t *ring, struct rte_mbuf *mbuf, const uint16_t mask,