[v3] net/mlx5: relaxed ordering for multi-packet RQ buffer refcnt
Commit Message
Use C11 atomics with explicit ordering instead of the rte_atomic ops
which enforce unnecessary barriers on aarch64.
Signed-off-by: Phil Yang <phil.yang@arm.com>
---
v3:
Split from the patchset:
http://patchwork.dpdk.org/cover/68159/
drivers/net/mlx5/mlx5_rxq.c | 2 +-
drivers/net/mlx5/mlx5_rxtx.c | 16 +++++++++-------
drivers/net/mlx5/mlx5_rxtx.h | 2 +-
3 files changed, 11 insertions(+), 9 deletions(-)
Comments
Hi,
We are also converting other components to C11 atomics.
Your insight would be much appreciated.
Thanks,
Phil Yang
> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Phil Yang
> Sent: Tuesday, June 23, 2020 4:27 PM
> To: dev@dpdk.org
> Cc: matan@mellanox.com; shahafs@mellanox.com;
> viacheslavo@mellanox.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; drc@linux.vnet.ibm.com; nd
> <nd@arm.com>
> Subject: [dpdk-dev] [PATCH v3] net/mlx5: relaxed ordering for multi-packet
> RQ buffer refcnt
>
> Use c11 atomics with explicit ordering instead of the rte_atomic ops
> which enforce unnecessary barriers on aarch64.
>
> Signed-off-by: Phil Yang <phil.yang@arm.com>
> ---
> v3:
> Split from the patchset:
> http://patchwork.dpdk.org/cover/68159/
>
> drivers/net/mlx5/mlx5_rxq.c | 2 +-
> drivers/net/mlx5/mlx5_rxtx.c | 16 +++++++++-------
> drivers/net/mlx5/mlx5_rxtx.h | 2 +-
> 3 files changed, 11 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
> index dda0073..7f487f1 100644
> --- a/drivers/net/mlx5/mlx5_rxq.c
> +++ b/drivers/net/mlx5/mlx5_rxq.c
> @@ -1545,7 +1545,7 @@ mlx5_mprq_buf_init(struct rte_mempool *mp, void *opaque_arg,
>
> memset(_m, 0, sizeof(*buf));
> buf->mp = mp;
> - rte_atomic16_set(&buf->refcnt, 1);
> + __atomic_store_n(&buf->refcnt, 1, __ATOMIC_RELAXED);
> for (j = 0; j != strd_n; ++j) {
> shinfo = &buf->shinfos[j];
> shinfo->free_cb = mlx5_mprq_buf_free_cb;
> diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
> index e4106bf..f0eda88 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.c
> +++ b/drivers/net/mlx5/mlx5_rxtx.c
> @@ -1595,10 +1595,11 @@ mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque)
> {
> struct mlx5_mprq_buf *buf = opaque;
>
> - if (rte_atomic16_read(&buf->refcnt) == 1) {
> + if (__atomic_load_n(&buf->refcnt, __ATOMIC_RELAXED) == 1) {
> rte_mempool_put(buf->mp, buf);
> - } else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) {
> - rte_atomic16_set(&buf->refcnt, 1);
> + } else if (unlikely(__atomic_sub_fetch(&buf->refcnt, 1,
> + __ATOMIC_RELAXED) == 0)) {
> + __atomic_store_n(&buf->refcnt, 1, __ATOMIC_RELAXED);
> rte_mempool_put(buf->mp, buf);
> }
> }
> @@ -1678,7 +1679,8 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
>
> if (consumed_strd == strd_n) {
> /* Replace WQE only if the buffer is still in use. */
> - if (rte_atomic16_read(&buf->refcnt) > 1) {
> + if (__atomic_load_n(&buf->refcnt,
> + __ATOMIC_RELAXED) > 1) {
> mprq_buf_replace(rxq, rq_ci & wq_mask,
> strd_n);
> /* Release the old buffer. */
> mlx5_mprq_buf_free(buf);
> @@ -1790,9 +1792,9 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
> 			void *buf_addr;
>
> 			/* Increment the refcnt of the whole chunk. */
> -			rte_atomic16_add_return(&buf->refcnt, 1);
> -			MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <=
> -				    strd_n + 1);
> +			__atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_ACQUIRE);
> +			MLX5_ASSERT(__atomic_load_n(&buf->refcnt,
> +				    __ATOMIC_RELAXED) <= strd_n + 1);
> 			buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
> 			/*
> 			 * MLX5 device doesn't use iova but it is necessary in a
> diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
> index 26621ff..0fc15f3 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.h
> +++ b/drivers/net/mlx5/mlx5_rxtx.h
> @@ -78,7 +78,7 @@ struct rxq_zip {
> /* Multi-Packet RQ buffer header. */
> struct mlx5_mprq_buf {
> struct rte_mempool *mp;
> - rte_atomic16_t refcnt; /* Atomically accessed refcnt. */
> + uint16_t refcnt; /* Atomically accessed refcnt. */
> 	uint8_t pad[RTE_PKTMBUF_HEADROOM]; /* Headroom for the first packet. */
> struct rte_mbuf_ext_shared_info shinfos[];
> /*
> --
> 2.7.4
Hi Phil Yang, we noticed that this patch gives us a 10% performance degradation on ARM.
x86 seems to be unaffected, though. Do you know what may be the reason for this behavior?
Regards,
Alex
> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Phil Yang
> Sent: Sunday, July 12, 2020 23:02
> To: Matan Azrad <matan@mellanox.com>; Shahaf Shuler
> <shahafs@mellanox.com>; Slava Ovsiienko <viacheslavo@mellanox.com>
> Cc: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>;
> drc@linux.vnet.ibm.com; nd <nd@arm.com>; Phil Yang <Phil.Yang@arm.com>;
> dev@dpdk.org; nd <nd@arm.com>
> Subject: Re: [dpdk-dev] [PATCH v3] net/mlx5: relaxed ordering for multi-packet
> RQ buffer refcnt
>
> Hi,
>
> We are also doing C11 atomics converting for other components.
> Your insight would be much appreciated.
>
> Thanks,
> Phil Yang
>
> > <...>
Alexander Kozyrev <akozyrev@mellanox.com> writes:
<...>
> Subject: RE: [dpdk-dev] [PATCH v3] net/mlx5: relaxed ordering for multi-
> packet RQ buffer refcnt
>
> Hi Phil Yang, we noticed that this patch gives us 10% of performance
> degradation on ARM.
> x86 seems to be unaffected though. Do you know what may be the reason
> of this behavior?
Hi Alexander,
Thanks for your feedback.
This patch removed some expensive memory barriers on aarch64; it should give better performance.
I am not sure of the root cause of this degradation yet; I will start the investigation, and we can profile this issue together.
Could you share your test case (including your testbed configuration) with us?
Thanks,
Phil
>
> Regards,
> Alex
>
> > <...>
> Phil Yang <Phil.Yang@arm.com> writes:
>
> <...>
I'm surprised too, Phil, but it looks like it is actually making things worse. I used a ConnectX-6 Dx on aarch64:
Linux dragon71-bf 5.4.31-mlnx.15.ge938819 #1 SMP PREEMPT Thu Jul 2 17:01:15 IDT 2020 aarch64 aarch64 aarch64 GNU/Linux
The traffic generator sends 60-byte packets and the DUT executes the following command:
arm64-bluefield-linuxapp-gcc/build/app/test-pmd/testpmd -n 4 -w 0000:03:00.1,mprq_en=1,rxqs_min_mprq=1 -w 0000:03:00.0,mprq_en=1,rxqs_min_mprq=1 -c 0xe -- --burst=64 --mbcache=512 -i --nb-cores=1 --rxq=1 --txq=1 --txd=256 --rxd=256 --auto-start --rss-udp
Without the patch I'm getting 3.2 Mpps, and only 2.9 Mpps when the patch is applied.
Regards,
Alex
<snip>
> > <...>
You are running on A72 cores, is that correct?
>
> Regards,
> Alex
<snip>
>
> <...>
> You are running on A72 cores, is that correct?
Correct. cat /proc/cpuinfo:
processor : 0
BogoMIPS : 312.50
Features : fp asimd evtstrm crc32 cpuid
CPU implementer : 0x41
CPU architecture: 8
CPU variant : 0x0
CPU part : 0xd08
CPU revision : 3
Alexander Kozyrev <akozyrev@mellanox.com> writes:
<snip>
> <...>
Thanks a lot for your input, Alex.
With your test command line, I remeasured this patch on two different aarch64 machines and both got some performance improvement.
SoC#1: On ThunderX2 (with LSE support), I see a 7.6% throughput improvement.
NIC: ConnectX-6 / driver: mlx5_core version: 5.0-1.0.0.0 / firmware-version: 20.27.1016 (MT_0000000224)
SoC#2: On N1SDP (I disabled LSE to generate A72-like instructions), I also see a slight (about 1%~2%) throughput improvement.
NIC: ConnectX-5 / driver: mlx5_core / version: 5.0-2.1.8 / firmware-version: 16.27.2008 (MT_0000000090)
Without LSE (i.e., the A72 and SoC#2 cases), the CPU uses the load/store-exclusive mechanism to achieve atomicity.
For example, it generates the instructions below for __atomic_add_fetch:
__atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_ACQUIRE);
70118: f94037e3 ldr x3, [sp, #104]
7011c: 91002060 add x0, x3, #0x8
70120: 485ffc02 ldaxrh w2, [x0]
70124: 11000442 add w2, w2, #0x1
70128: 48057c02 stxrh w5, w2, [x0]
7012c: 35ffffa5 cbnz w5, 70120 <mlx5_rx_burst_mprq+0xb48>
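For reference: ldaxrh is a load-acquire exclusive of the 16-bit refcnt, add increments it, stxrh attempts the paired store-exclusive, and cbnz retries the sequence if another CPU intervened. A minimal standalone snippet to reproduce this codegen is sketched below (the function name is hypothetical; built with gcc -O2 on aarch64 without LSE):

	#include <stdint.h>

	/* Same shape as the refcnt update in mlx5_rx_burst_mprq().
	 * Without LSE this compiles to the ldaxrh/stxrh retry loop
	 * shown above; the acquire semantics come from ldaxrh. */
	static inline uint16_t
	refcnt_add_fetch_acquire(uint16_t *refcnt)
	{
		return __atomic_add_fetch(refcnt, 1, __ATOMIC_ACQUIRE);
	}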
In general, I think this patch will not lead to a sharp decline in performance.
Maybe you can try other testbeds?
Thanks,
Phil
Hi Alexander,
Thank you for testing this patch. A few comments below.
<snip>
> > <...>
> > > @@ -1790,9 +1792,9 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
> > > void *buf_addr;
> > >
> > > /* Increment the refcnt of the whole chunk. */
> > > - rte_atomic16_add_return(&buf->refcnt, 1);
rte_atomic16_add_return includes a full barrier along with the atomic operation. But is a full barrier required here? For example, __atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED) will offer atomicity, but no barrier. Would that be enough?
> > > -		MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <=
> > > -			    strd_n + 1);
> > > +		__atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_ACQUIRE);
Can you replace just the above line with the following lines and test it?
__atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED);
__atomic_thread_fence(__ATOMIC_ACQ_REL);
This should make the generated code the same as before this patch. Let me know if you would prefer us to re-spin the patch instead (for testing).
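To make the alternatives concrete, a sketch of the three candidate forms of this increment (same buf->refcnt field; the MLX5_ASSERT is omitted here):

	/* v3 as posted: RMW with acquire semantics fused in. */
	__atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_ACQUIRE);

	/* Experiment suggested above: relaxed RMW plus a standalone
	 * fence, which should match the code generated by the old
	 * rte_atomic16_add_return() (atomic add + full barrier). */
	__atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED);
	__atomic_thread_fence(__ATOMIC_ACQ_REL);

	/* The open question: atomicity alone, with no barrier. */
	__atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED);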
> > > + MLX5_ASSERT(__atomic_load_n(&buf->refcnt,
> > > + __ATOMIC_RELAXED) <= strd_n + 1);
> > > <...>
<snip>
>
> <...>
> > > > /* Increment the refcnt of the whole chunk. */
> > > > -rte_atomic16_add_return(&buf->refcnt, 1);
> rte_atomic16_add_return includes a full barrier along with atomic operation.
> But is full barrier required here? For ex: __atomic_add_fetch(&buf->refcnt, 1,
> __ATOMIC_RELAXED) will offer atomicity, but no barrier. Would that be
> enough?
>
> > > > > -		MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <=
> > > > > -			    strd_n + 1);
> > > > > +		__atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_ACQUIRE);
The atomic load in MLX5_ASSERT() accesses the same memory location as the previous __atomic_add_fetch() does.
They will access this memory location in program order when MLX5_PMD_DEBUG is enabled, so the ACQUIRE barrier in __atomic_add_fetch() becomes unnecessary.
By changing it to RELAXED ordering, this patch got a 7.6% performance improvement on N1 (making it generate A72-like instructions).
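In other words, the relaxed form being proposed for testing would look like this (a sketch; the actual respin may differ):

	__atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED);
	MLX5_ASSERT(__atomic_load_n(&buf->refcnt,
				    __ATOMIC_RELAXED) <= strd_n + 1);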
Could you please also try it on your testbed, Alex?
>
> Can you replace just the above line with the following lines and test it?
>
> __atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED);
> __atomic_thread_fence(__ATOMIC_ACQ_REL);
>
> This should make the generated code same as before this patch. Let me
> know if you would prefer us to re-spin the patch instead (for testing).
>
> <...>
>
> <snip>
> <...>
> By changing it to RELAXED ordering, this patch got 7.6% performance
> improvement on N1 (making it generate A72 alike instructions).
>
> Could you please also try it on your testbed, Alex?
The situation got better with this modification; here are the results:
- no patch: 3.0 Mpps, CPU cycles/packet=51.52
- original patch: 2.1 Mpps, CPU cycles/packet=71.05
- modified patch: 2.9 Mpps, CPU cycles/packet=52.79
Also, I found that the degradation is there only when I enable burst stats.
Could you please turn on the following config options and see if you can reproduce this as well?
CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES=y
CONFIG_RTE_TEST_PMD_RECORD_BURST_STATS=y
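For context: these options make testpmd wrap each burst in the forwarding loop with a cycle-counter read and a burst-size histogram update, so the measurement itself adds work to the hot path. A rough sketch of the pattern (not the exact testpmd code):

	uint64_t start_tsc = rte_rdtsc();
	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue,
				 pkts_burst, nb_pkt_per_burst);
	fs->rx_burst_stats.pkt_burst_spread[nb_rx]++; /* burst histogram */
	/* ... forward and transmit the burst ... */
	fs->core_cycles += rte_rdtsc() - start_tsc;   /* cycles per burst */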
> <...>
Alexander Kozyrev <akozyrev@mellanox.com> writes:
<snip>
> <...>
> Situation got better with this modification, here are the results:
> - no patch: 3.0 Mpps CPU cycles/packet=51.52
> - original patch: 2.1 Mpps CPU cycles/packet=71.05
> - modified patch: 2.9 Mpps CPU cycles/packet=52.79
> Also, I found that the degradation is there only in case I enable bursts stats.
Great! So this patch will not hurt the normal datapath performance.
> Could you please turn on the following config options and see if you can
> reproduce this as well?
> CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES=y
> CONFIG_RTE_TEST_PMD_RECORD_BURST_STATS=y
Thanks, Alex. Some updates.
A slight (about 1%) throughput degradation was detected after we enabled these two config options on the N1 SoC.
If we look inside the perf stats results: with this patch, both mlx5_rx_burst and mlx5_tx_burst consume fewer CPU cycles than the original code.
However, __memcpy_generic takes more cycles. I think that might be the reason for the CPU-cycles-per-packet increment after applying this patch.
Original code:
98.07%--pkt_burst_io_forward
|
|--44.53%--__memcpy_generic
|
|--35.85%--mlx5_rx_burst_mprq
|
|--15.94%--mlx5_tx_burst_none_empw
| |
| |--7.32%--mlx5_tx_handle_completion.isra.0
| |
| --0.50%--__memcpy_generic
|
--1.14%--memcpy@plt
Use C11 with RELAXED ordering:
99.36%--pkt_burst_io_forward
|
|--47.40%--__memcpy_generic
|
|--34.62%--mlx5_rx_burst_mprq
|
|--15.55%--mlx5_tx_burst_none_empw
| |
| --7.08%--mlx5_tx_handle_completion.isra.0
|
--1.17%--memcpy@plt
BTW, none of the atomic operations in this patch are the hotspot.
> <...>
<snip>
> Phil Yang <Phil.Yang@arm.com> writes:
> <...>
Phil, we are seeing much worse degradation on our ARM platform, unfortunately.
I don't think the discrepancy in memcpy can explain this behavior; your patch does not touch that area of the code.
Let me collect some perf stats on our side.
> <...>
<snip>
> <...>
>
> Phil, we are seeing much worse degradation on our ARM platform
> unfortunately.
> I don't think that discrepancy in memcpy can explain this behavior.
> Your patch is not touching this area of code. Let me collect some perf stat on
> our side.
Are you testing the patch as is or have you made the changes that were discussed in the thread?
> <...>
> Are you testing the patch as is or have you made the changes that were
> discussed in the thread?
>
Yes, I made the changes you suggested; it really gets better with them.
Could you please respin the patch to make sure I got it right in my environment?
> <...>
<snip>
> >
> > > >
> > > > > > > > > > @@ -1790,9 +1792,9 @@ mlx5_rx_burst_mprq(void *dpdk_rxq,
> > > > > > > > > > struct rte_mbuf **pkts, uint16_t pkts_n)
> > > > > > > > > > 	void *buf_addr;
> > > > > > > > > >
> > > > > > > > > > 	/* Increment the refcnt of the whole chunk. */
> > > > > > > > > > -	rte_atomic16_add_return(&buf->refcnt, 1);
> > > > > > > rte_atomic16_add_return includes a full barrier along with the
> > > > > > > atomic operation.
> > > > > > > But is a full barrier required here? For example,
> > > > > > > __atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED) will offer
> > > > > > > atomicity, but no barrier. Would that be enough?
> > > > > > >
> > > > > > > > > > -	MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <=
> > > > > > > > > > -		    strd_n + 1);
> > > > > > > > > > +	__atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_ACQUIRE);
> > > > > >
> > > > > > The atomic load in MLX5_ASSERT() accesses the same memory space
> > > > > > as the previous __atomic_add_fetch() does.
> > > > > > They will access this memory space in program order when we enable
> > > > > > MLX5_PMD_DEBUG, so the ACQUIRE barrier in __atomic_add_fetch()
> > > > > > becomes unnecessary.
> > > > > >
> > > > > > By changing it to RELAXED ordering, this patch got a 7.6%
> > > > > > performance improvement on N1 (making it generate A72-like
> > > > > > instructions).
> > > > > >
> > > > > > Could you please also try it on your testbed, Alex?
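Put back into context, the relaxed variant Phil describes would look roughly like this at the increment site (a sketch reassembled from the hunks quoted in this thread; buf, strd_n and MLX5_ASSERT come from the surrounding driver sources):

	/* Increment the refcnt of the whole chunk. */
	__atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED);
	/* The assertion reloads the same atomic object the add just modified;
	 * coherence on a single location already orders the two accesses, so
	 * no acquire semantics are needed for the debug check. */
	MLX5_ASSERT(__atomic_load_n(&buf->refcnt, __ATOMIC_RELAXED) <= strd_n + 1);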
Thanks, Alex.
Please check the new version here.
http://patchwork.dpdk.org/patch/76335/
This patch is definitely better, do not see a degradation anymore, thank you.
Acked-by: Alexander Kozyrev <akozyrev@nvidia.com>
Can you please add your Acked-by tag in v4 (Phil provided the link above)?
<snip>
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index dda0073..7f487f1 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -1545,7 +1545,7 @@ mlx5_mprq_buf_init(struct rte_mempool *mp, void *opaque_arg,
memset(_m, 0, sizeof(*buf));
buf->mp = mp;
- rte_atomic16_set(&buf->refcnt, 1);
+ __atomic_store_n(&buf->refcnt, 1, __ATOMIC_RELAXED);
for (j = 0; j != strd_n; ++j) {
shinfo = &buf->shinfos[j];
shinfo->free_cb = mlx5_mprq_buf_free_cb;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index e4106bf..f0eda88 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -1595,10 +1595,11 @@ mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque)
{
struct mlx5_mprq_buf *buf = opaque;
- if (rte_atomic16_read(&buf->refcnt) == 1) {
+ if (__atomic_load_n(&buf->refcnt, __ATOMIC_RELAXED) == 1) {
rte_mempool_put(buf->mp, buf);
- } else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) {
- rte_atomic16_set(&buf->refcnt, 1);
+ } else if (unlikely(__atomic_sub_fetch(&buf->refcnt, 1,
+ __ATOMIC_RELAXED) == 0)) {
+ __atomic_store_n(&buf->refcnt, 1, __ATOMIC_RELAXED);
rte_mempool_put(buf->mp, buf);
}
}
@@ -1678,7 +1679,8 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (consumed_strd == strd_n) {
/* Replace WQE only if the buffer is still in use. */
- if (rte_atomic16_read(&buf->refcnt) > 1) {
+ if (__atomic_load_n(&buf->refcnt,
+ __ATOMIC_RELAXED) > 1) {
mprq_buf_replace(rxq, rq_ci & wq_mask, strd_n);
/* Release the old buffer. */
mlx5_mprq_buf_free(buf);
@@ -1790,9 +1792,9 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
void *buf_addr;
/* Increment the refcnt of the whole chunk. */
- rte_atomic16_add_return(&buf->refcnt, 1);
- MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <=
- strd_n + 1);
+ __atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_ACQUIRE);
+ MLX5_ASSERT(__atomic_load_n(&buf->refcnt,
+ __ATOMIC_RELAXED) <= strd_n + 1);
buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
/*
* MLX5 device doesn't use iova but it is necessary in a
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 26621ff..0fc15f3 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -78,7 +78,7 @@ struct rxq_zip {
/* Multi-Packet RQ buffer header. */
struct mlx5_mprq_buf {
struct rte_mempool *mp;
- rte_atomic16_t refcnt; /* Atomically accessed refcnt. */
+ uint16_t refcnt; /* Atomically accessed refcnt. */
uint8_t pad[RTE_PKTMBUF_HEADROOM]; /* Headroom for the first packet. */
struct rte_mbuf_ext_shared_info shinfos[];
/*
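To summarize the refcnt lifecycle the diff implements, here is a small self-contained model (simplified, assumed names; an illustration rather than the driver code). The buffer starts at refcnt == 1, each attached mbuf adds one relaxed reference, and release mirrors mlx5_mprq_buf_free_cb() by resetting the count to 1 before the buffer would go back to the mempool:

#include <stdint.h>
#include <stdio.h>

struct mprq_buf_model {
	uint16_t refcnt; /* accessed only through __atomic builtins */
};

static void buf_init(struct mprq_buf_model *b)
{
	__atomic_store_n(&b->refcnt, 1, __ATOMIC_RELAXED);
}

static void buf_attach(struct mprq_buf_model *b)
{
	/* One more mbuf points into the chunk. */
	__atomic_add_fetch(&b->refcnt, 1, __ATOMIC_RELAXED);
}

/* Returns 1 when the buffer may be returned to the mempool. */
static int buf_release(struct mprq_buf_model *b)
{
	if (__atomic_load_n(&b->refcnt, __ATOMIC_RELAXED) == 1)
		return 1;
	if (__atomic_sub_fetch(&b->refcnt, 1, __ATOMIC_RELAXED) == 0) {
		/* Raced down to zero: restore the base reference first. */
		__atomic_store_n(&b->refcnt, 1, __ATOMIC_RELAXED);
		return 1;
	}
	return 0;
}

int main(void)
{
	struct mprq_buf_model b;

	buf_init(&b);
	buf_attach(&b);                        /* refcnt: 1 -> 2 */
	printf("freed=%d\n", buf_release(&b)); /* 0: a reference remains */
	printf("freed=%d\n", buf_release(&b)); /* 1: last reference gone */
	return 0;
}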