mbuf: replace GCC marker extension with C11 anonymous unions

Message ID 1706657173-26166-2-git-send-email-roretzla@linux.microsoft.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series mbuf: replace GCC marker extension with C11 anonymous unions

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/loongarch-compilation success Compilation OK
ci/loongarch-unit-testing success Unit Testing PASS
ci/Intel-compilation fail Compilation issues
ci/iol-broadcom-Performance success Performance Testing PASS
ci/intel-Testing success Testing PASS
ci/iol-sample-apps-testing success Testing PASS
ci/iol-abi-testing warning Testing issues
ci/iol-broadcom-Functional success Functional Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/intel-Functional success Functional PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/github-robot: build fail github build: failed
ci/iol-compile-amd64-testing success Testing PASS
ci/iol-unit-amd64-testing success Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-compile-arm64-testing success Testing PASS
ci/iol-unit-arm64-testing success Testing PASS

Commit Message

Tyler Retzlaff Jan. 30, 2024, 11:26 p.m. UTC
  Replace the use of RTE_MARKER<x> with C11 anonymous unions to improve
code portability between toolchains.

Update use of rte_mbuf rearm_data field in net/ionic, net/sfc and
net/virtio which were accessing field as a zero-length array.

Signed-off-by: Tyler Retzlaff <roretzla@linux.microsoft.com>
---
 drivers/net/ionic/ionic_lif.c               |   8 +-
 drivers/net/ionic/ionic_rxtx_sg.c           |   4 +-
 drivers/net/ionic/ionic_rxtx_simple.c       |   2 +-
 drivers/net/sfc/sfc_ef100_rx.c              |   8 +-
 drivers/net/sfc/sfc_ef10_rx.c               |  12 +--
 drivers/net/virtio/virtio_rxtx_packed_avx.h |   8 +-
 lib/mbuf/rte_mbuf_core.h                    | 135 +++++++++++++++-------------
 7 files changed, 94 insertions(+), 83 deletions(-)
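
For readers unfamiliar with the construct being replaced, here is a minimal, illustrative sketch (not part of the patch; field lists abbreviated) of the old RTE_MARKER pattern versus the anonymous-union replacement, and how a driver's access to the rearmed bytes changes:

/* Old pattern: RTE_MARKER64 is a zero-length uint64_t array (GCC extension),
 * so drivers read the rearmed bytes through element [0] of the marker. */
struct old_sketch {
	RTE_MARKER64 rearm_data;     /* zero size, only names an offset */
	uint16_t data_off;
	uint16_t refcnt;
	uint16_t nb_segs;
	uint16_t port;
};
/* driver: uint64_t v = m->rearm_data[0]; */

/* New pattern: a C11 anonymous union gives the same offset a real uint64_t,
 * so the GCC extension is no longer needed and MSVC can compile the struct. */
struct new_sketch {
	union {
		uint64_t rearm_data;
		struct {
			uint16_t data_off;
			uint16_t refcnt;
			uint16_t nb_segs;
			uint16_t port;
		};
	};
};
/* driver: uint64_t v = m->rearm_data; */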
  

Comments

Morten Brørup Jan. 31, 2024, 9:18 a.m. UTC | #1
> From: Tyler Retzlaff [mailto:roretzla@linux.microsoft.com]
> Sent: Wednesday, 31 January 2024 00.26
> 
> Replace the use of RTE_MARKER<x> with C11 anonymous unions to improve
> code portability between toolchains.
> 
> Update use of rte_mbuf rearm_data field in net/ionic, net/sfc and
> net/virtio which were accessing field as a zero-length array.
> 
> Signed-off-by: Tyler Retzlaff <roretzla@linux.microsoft.com>
> ---

I have some comments, putting weight on code readability rather than avoiding API breakage.

We can consider my suggested API breaking changes for the next API breaking release, and keep your goal of minimal API breakage with the current changes.

> diff --git a/lib/mbuf/rte_mbuf_core.h b/lib/mbuf/rte_mbuf_core.h
> index 5688683..d731ea0 100644
> --- a/lib/mbuf/rte_mbuf_core.h
> +++ b/lib/mbuf/rte_mbuf_core.h
> @@ -464,9 +464,10 @@ enum {
>   * The generic rte_mbuf, containing a packet mbuf.
>   */
>  struct rte_mbuf {
> -	RTE_MARKER cacheline0;
> -
> -	void *buf_addr;           /**< Virtual address of segment buffer.
> */
> +	union {
> +	    void *cacheline0;
> +	    void *buf_addr;           /**< Virtual address of segment
> buffer. */
> +	};

I suppose this is the least ugly workaround for not being able to use the RTE_MARKER hack here.

>  #if RTE_IOVA_IN_MBUF
>  	/**
>  	 * Physical address of segment buffer.
> @@ -487,69 +488,77 @@ struct rte_mbuf {
>  #endif
> 
>  	/* next 8 bytes are initialised on RX descriptor rearm */
> -	RTE_MARKER64 rearm_data;
> -	uint16_t data_off;
> -
> -	/**
> -	 * Reference counter. Its size should at least equal to the size
> -	 * of port field (16 bits), to support zero-copy broadcast.
> -	 * It should only be accessed using the following functions:
> -	 * rte_mbuf_refcnt_update(), rte_mbuf_refcnt_read(), and
> -	 * rte_mbuf_refcnt_set(). The functionality of these functions
> (atomic,
> -	 * or non-atomic) is controlled by the RTE_MBUF_REFCNT_ATOMIC
> flag.
> -	 */
> -	RTE_ATOMIC(uint16_t) refcnt;
> +	union {
> +		uint64_t rearm_data;

I consider this union with uint64_t rearm_data an improvement for code readability. Using a marker here was weird.

> +		struct {
> +			uint16_t data_off;
> +
> +			/**
> +			 * Reference counter. Its size should at least equal
> to the size
> +			 * of port field (16 bits), to support zero-copy
> broadcast.
> +			 * It should only be accessed using the following
> functions:
> +			 * rte_mbuf_refcnt_update(), rte_mbuf_refcnt_read(),
> and
> +			 * rte_mbuf_refcnt_set(). The functionality of these
> functions (atomic,
> +			 * or non-atomic) is controlled by the
> RTE_MBUF_REFCNT_ATOMIC flag.
> +			 */
> +			RTE_ATOMIC(uint16_t) refcnt;
> 
> -	/**
> -	 * Number of segments. Only valid for the first segment of an
> mbuf
> -	 * chain.
> -	 */
> -	uint16_t nb_segs;
> +			/**
> +			 * Number of segments. Only valid for the first
> segment of an mbuf
> +			 * chain.
> +			 */
> +			uint16_t nb_segs;
> 
> -	/** Input port (16 bits to support more than 256 virtual ports).
> -	 * The event eth Tx adapter uses this field to specify the output
> port.
> -	 */
> -	uint16_t port;
> +			/** Input port (16 bits to support more than 256
> virtual ports).
> +			 * The event eth Tx adapter uses this field to
> specify the output port.
> +			 */
> +			uint16_t port;
> 
> -	uint64_t ol_flags;        /**< Offload features. */
> +			uint64_t ol_flags;        /**< Offload features. */

Either:
1. If the comment about 8 bytes init on rearm is correct: ol_flags should remain outside the struct and union, i.e. at top level, else
2. It would be nice to increase the size of the rearm_data variable to 16 byte, so it covers the entire struct being rearmed. (And the incorrect comment about how many bytes are being rearmed should be fixed.)
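
A rough sketch of option 2 (illustrative only, not part of this patch) with rearm_data widened to cover everything initialised on rearm, including ol_flags:

struct rearm16_sketch {
	union {
		uint64_t rearm_data[2];       /* 16 bytes initialised on RX rearm */
		struct {
			uint16_t data_off;
			uint16_t refcnt;      /* RTE_ATOMIC() qualifier omitted in this sketch */
			uint16_t nb_segs;
			uint16_t port;
			uint64_t ol_flags;    /* offload features, now covered by rearm_data */
		};
	};
};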

> +		};
> +	};
> 
>  	/* remaining bytes are set on RX when pulling packet from
> descriptor */
> -	RTE_MARKER rx_descriptor_fields1;
> -
> -	/*
> -	 * The packet type, which is the combination of outer/inner L2,
> L3, L4
> -	 * and tunnel types. The packet_type is about data really present
> in the
> -	 * mbuf. Example: if vlan stripping is enabled, a received vlan
> packet
> -	 * would have RTE_PTYPE_L2_ETHER and not RTE_PTYPE_L2_VLAN
> because the
> -	 * vlan is stripped from the data.
> -	 */
>  	union {
> -		uint32_t packet_type; /**< L2/L3/L4 and tunnel information.
> */
> -		__extension__
> +		void *rx_descriptor_fields1;

Instead of using void* for rx_descriptor_fields1, it would be nice to make rx_descriptor_fields1 a type of the correct size. It might need to be an uint32_t array to avoid imposing additional alignment requirements.
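
A rough sketch of that suggestion (illustrative only, not the patch): type the alias as a uint32_t array so it has a concrete size without raising alignment; the element count here is hypothetical:

struct rx_fields_sketch {
	union {
		uint32_t rx_descriptor_fields1[2];  /* sized alias, 4-byte aligned */
		struct {
			uint32_t packet_type;       /* L2/L3/L4 and tunnel information */
			uint32_t pkt_len;           /* total packet length */
		};
	};
};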

> +
> +		/*
> +		 * The packet type, which is the combination of outer/inner
> L2, L3, L4
> +		 * and tunnel types. The packet_type is about data really
> present in the
> +		 * mbuf. Example: if vlan stripping is enabled, a received
> vlan packet
> +		 * would have RTE_PTYPE_L2_ETHER and not RTE_PTYPE_L2_VLAN
> because the
> +		 * vlan is stripped from the data.
> +		 */
>  		struct {
> -			uint8_t l2_type:4;   /**< (Outer) L2 type. */
> -			uint8_t l3_type:4;   /**< (Outer) L3 type. */
> -			uint8_t l4_type:4;   /**< (Outer) L4 type. */
> -			uint8_t tun_type:4;  /**< Tunnel type. */
>  			union {
> -				uint8_t inner_esp_next_proto;
> -				/**< ESP next protocol type, valid if
> -				 * RTE_PTYPE_TUNNEL_ESP tunnel type is set
> -				 * on both Tx and Rx.
> -				 */
> +				uint32_t packet_type; /**< L2/L3/L4 and tunnel
> information. */
>  				__extension__
>  				struct {
> -					uint8_t inner_l2_type:4;
> -					/**< Inner L2 type. */
> -					uint8_t inner_l3_type:4;
> -					/**< Inner L3 type. */
> +					uint8_t l2_type:4;   /**< (Outer) L2
> type. */
> +					uint8_t l3_type:4;   /**< (Outer) L3
> type. */
> +					uint8_t l4_type:4;   /**< (Outer) L4
> type. */
> +					uint8_t tun_type:4;  /**< Tunnel type. */
> +					union {
> +						uint8_t inner_esp_next_proto;
> +						/**< ESP next protocol type, valid
> if
> +						 * RTE_PTYPE_TUNNEL_ESP tunnel type
> is set
> +						 * on both Tx and Rx.
> +						 */
> +						__extension__
> +						struct {
> +							uint8_t inner_l2_type:4;
> +							/**< Inner L2 type. */
> +							uint8_t inner_l3_type:4;
> +							/**< Inner L3 type. */
> +						};
> +					};
> +					uint8_t inner_l4_type:4; /**< Inner L4
> type. */
>  				};
>  			};
> -			uint8_t inner_l4_type:4; /**< Inner L4 type. */
> +			uint32_t pkt_len;         /**< Total pkt len: sum of
> all segments. */
>  		};
>  	};
> 
> -	uint32_t pkt_len;         /**< Total pkt len: sum of all
> segments. */
>  	uint16_t data_len;        /**< Amount of data in segment buffer.
> */
>  	/** VLAN TCI (CPU order), valid if RTE_MBUF_F_RX_VLAN is set. */
>  	uint16_t vlan_tci;
> @@ -595,21 +604,23 @@ struct rte_mbuf {
>  	struct rte_mempool *pool; /**< Pool from which mbuf was
> allocated. */
> 
>  	/* second cache line - fields only used in slow path or on TX */
> -	RTE_MARKER cacheline1 __rte_cache_min_aligned;
> +	union {
> +		void *cacheline1;

The __rte_cache_min_aligned cannot be removed. It provides cache line alignment for 32 bit platforms, where pointers in the first cache line only use 4 bytes.

NB: The rte_mbuf structure could be optimized for 32 bit platforms by moving fields from the second cache line to the holes in the first, but that's another discussion.

> 
>  #if RTE_IOVA_IN_MBUF
> -	/**
> -	 * Next segment of scattered packet. Must be NULL in the last
> -	 * segment or in case of non-segmented packet.
> -	 */
> -	struct rte_mbuf *next;
> +		/**
> +		 * Next segment of scattered packet. Must be NULL in the
> last
> +		 * segment or in case of non-segmented packet.
> +		 */
> +		struct rte_mbuf *next;
>  #else
> -	/**
> -	 * Reserved for dynamic fields
> -	 * when the next pointer is in first cache line (i.e.
> RTE_IOVA_IN_MBUF is 0).
> -	 */
> -	uint64_t dynfield2;
> +		/**
> +		 * Reserved for dynamic fields
> +		 * when the next pointer is in first cache line (i.e.
> RTE_IOVA_IN_MBUF is 0).
> +		 */
> +		uint64_t dynfield2;
>  #endif
> +	};
> 
>  	/* fields to support TX offloads */
>  	union {
> --
> 1.8.3.1
  
Bruce Richardson Jan. 31, 2024, 1:49 p.m. UTC | #2
On Tue, Jan 30, 2024 at 03:26:13PM -0800, Tyler Retzlaff wrote:
> Replace the use of RTE_MARKER<x> with C11 anonymous unions to improve
> code portability between toolchains.
> 
> Update use of rte_mbuf rearm_data field in net/ionic, net/sfc and
> net/virtio which were accessing field as a zero-length array.
> 
> Signed-off-by: Tyler Retzlaff <roretzla@linux.microsoft.com>
> ---
>  drivers/net/ionic/ionic_lif.c               |   8 +-
>  drivers/net/ionic/ionic_rxtx_sg.c           |   4 +-
>  drivers/net/ionic/ionic_rxtx_simple.c       |   2 +-
>  drivers/net/sfc/sfc_ef100_rx.c              |   8 +-
>  drivers/net/sfc/sfc_ef10_rx.c               |  12 +--
>  drivers/net/virtio/virtio_rxtx_packed_avx.h |   8 +-
>  lib/mbuf/rte_mbuf_core.h                    | 135 +++++++++++++++-------------
>  7 files changed, 94 insertions(+), 83 deletions(-)
> 
<snip>
@@ -464,9 +464,10 @@ enum {
>   * The generic rte_mbuf, containing a packet mbuf.
>   */
>  struct rte_mbuf {
> -	RTE_MARKER cacheline0;
> -
> -	void *buf_addr;           /**< Virtual address of segment buffer. */
> +	union {
> +	    void *cacheline0;
> +	    void *buf_addr;           /**< Virtual address of segment buffer. */
> +	};

This marker is never used, so we should just look to drop it. I think it
was originally added to have an equivalent to the cacheline1 marker.

However, that would be an ABI change, so I'm ok to have this as-is for now.

/Bruce
  
Tyler Retzlaff Jan. 31, 2024, 8:45 p.m. UTC | #3
On Wed, Jan 31, 2024 at 01:49:34PM +0000, Bruce Richardson wrote:
> On Tue, Jan 30, 2024 at 03:26:13PM -0800, Tyler Retzlaff wrote:
> > Replace the use of RTE_MARKER<x> with C11 anonymous unions to improve
> > code portability between toolchains.
> > 
> > Update use of rte_mbuf rearm_data field in net/ionic, net/sfc and
> > net/virtio which were accessing field as a zero-length array.
> > 
> > Signed-off-by: Tyler Retzlaff <roretzla@linux.microsoft.com>
> > ---
> >  drivers/net/ionic/ionic_lif.c               |   8 +-
> >  drivers/net/ionic/ionic_rxtx_sg.c           |   4 +-
> >  drivers/net/ionic/ionic_rxtx_simple.c       |   2 +-
> >  drivers/net/sfc/sfc_ef100_rx.c              |   8 +-
> >  drivers/net/sfc/sfc_ef10_rx.c               |  12 +--
> >  drivers/net/virtio/virtio_rxtx_packed_avx.h |   8 +-
> >  lib/mbuf/rte_mbuf_core.h                    | 135 +++++++++++++++-------------
> >  7 files changed, 94 insertions(+), 83 deletions(-)
> > 
> <snip>
> @@ -464,9 +464,10 @@ enum {
> >   * The generic rte_mbuf, containing a packet mbuf.
> >   */
> >  struct rte_mbuf {
> > -	RTE_MARKER cacheline0;
> > -
> > -	void *buf_addr;           /**< Virtual address of segment buffer. */
> > +	union {
> > +	    void *cacheline0;
> > +	    void *buf_addr;           /**< Virtual address of segment buffer. */
> > +	};
> 
> This marker is never used, so we should just look to drop it. I think it
> was originally added to have an equivalent to the cacheline1 marker.

it's actually got a use in one location.

rte_mbuf.h:

static inline void
rte_mbuf_prefetch_part1(struct rte_mbuf *m)
{
        rte_prefetch0(&m->cacheline0);
}

> However, that would be an ABI change, so I'm ok to have this as-is for now.

do you mean api change? just asking to make sure i understand what i'm
doing.

as i understand how this extension (marker) works, removing the
cacheline0 marker would not alter the layout of the struct. that is,
neither the size of the struct, the size of any field, nor the offset of
any field would change with the marker removed.
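
One quick way to confirm that (a sketch using C11 static_assert; the asserted values reflect the layout invariants discussed in this thread) is to check the offsets and size before and after removing the marker:

#include <assert.h>
#include <stddef.h>

#include <rte_common.h>
#include <rte_mbuf_core.h>

/* a zero-size marker contributes no storage, so both checks hold whether or
 * not the cacheline0 marker is present */
static_assert(offsetof(struct rte_mbuf, buf_addr) == 0,
	"buf_addr starts the first cache line");
static_assert(sizeof(struct rte_mbuf) == 2 * RTE_CACHE_LINE_MIN_SIZE,
	"struct rte_mbuf stays at two minimum cache lines");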

> 
> /Bruce
  
Tyler Retzlaff Jan. 31, 2024, 9:09 p.m. UTC | #4
On Wed, Jan 31, 2024 at 10:18:37AM +0100, Morten Brørup wrote:
> > From: Tyler Retzlaff [mailto:roretzla@linux.microsoft.com]
> > Sent: Wednesday, 31 January 2024 00.26
> > 
> > Replace the use of RTE_MARKER<x> with C11 anonymous unions to improve
> > code portability between toolchains.
> > 
> > Update use of rte_mbuf rearm_data field in net/ionic, net/sfc and
> > net/virtio which were accessing field as a zero-length array.
> > 
> > Signed-off-by: Tyler Retzlaff <roretzla@linux.microsoft.com>
> > ---
> 
> I have some comments, putting weight on code readability rather than avoiding API breakage.
> 
> We can consider my suggested API breaking changes for the next API breaking release, and keep your goal of minimal API breakage with the current changes.

thanks appreciate your help with this one.

> 
> > diff --git a/lib/mbuf/rte_mbuf_core.h b/lib/mbuf/rte_mbuf_core.h
> > index 5688683..d731ea0 100644
> > --- a/lib/mbuf/rte_mbuf_core.h
> > +++ b/lib/mbuf/rte_mbuf_core.h
> > @@ -464,9 +464,10 @@ enum {
> >   * The generic rte_mbuf, containing a packet mbuf.
> >   */
> >  struct rte_mbuf {
> > -	RTE_MARKER cacheline0;
> > -
> > -	void *buf_addr;           /**< Virtual address of segment buffer.
> > */
> > +	union {
> > +	    void *cacheline0;
> > +	    void *buf_addr;           /**< Virtual address of segment
> > buffer. */
> > +	};
> 
> I suppose this is the least ugly workaround for not being able to use the RTE_MARKER hack here.

it is but i'm absolutely open to alternatives that work with all
toolchains and both C and C++ if there are any.

> 
> >  #if RTE_IOVA_IN_MBUF
> >  	/**
> >  	 * Physical address of segment buffer.
> > @@ -487,69 +488,77 @@ struct rte_mbuf {
> >  #endif
> > 
> >  	/* next 8 bytes are initialised on RX descriptor rearm */
> > -	RTE_MARKER64 rearm_data;
> > -	uint16_t data_off;
> > -
> > -	/**
> > -	 * Reference counter. Its size should at least equal to the size
> > -	 * of port field (16 bits), to support zero-copy broadcast.
> > -	 * It should only be accessed using the following functions:
> > -	 * rte_mbuf_refcnt_update(), rte_mbuf_refcnt_read(), and
> > -	 * rte_mbuf_refcnt_set(). The functionality of these functions
> > (atomic,
> > -	 * or non-atomic) is controlled by the RTE_MBUF_REFCNT_ATOMIC
> > flag.
> > -	 */
> > -	RTE_ATOMIC(uint16_t) refcnt;
> > +	union {
> > +		uint64_t rearm_data;
> 
> I consider this union with uint64_t rearm_data an improvement for code readability. Using a marker here was weird.
> 
> > +		struct {
> > +			uint16_t data_off;
> > +
> > +			/**
> > +			 * Reference counter. Its size should at least equal
> > to the size
> > +			 * of port field (16 bits), to support zero-copy
> > broadcast.
> > +			 * It should only be accessed using the following
> > functions:
> > +			 * rte_mbuf_refcnt_update(), rte_mbuf_refcnt_read(),
> > and
> > +			 * rte_mbuf_refcnt_set(). The functionality of these
> > functions (atomic,
> > +			 * or non-atomic) is controlled by the
> > RTE_MBUF_REFCNT_ATOMIC flag.
> > +			 */
> > +			RTE_ATOMIC(uint16_t) refcnt;
> > 
> > -	/**
> > -	 * Number of segments. Only valid for the first segment of an
> > mbuf
> > -	 * chain.
> > -	 */
> > -	uint16_t nb_segs;
> > +			/**
> > +			 * Number of segments. Only valid for the first
> > segment of an mbuf
> > +			 * chain.
> > +			 */
> > +			uint16_t nb_segs;
> > 
> > -	/** Input port (16 bits to support more than 256 virtual ports).
> > -	 * The event eth Tx adapter uses this field to specify the output
> > port.
> > -	 */
> > -	uint16_t port;
> > +			/** Input port (16 bits to support more than 256
> > virtual ports).
> > +			 * The event eth Tx adapter uses this field to
> > specify the output port.
> > +			 */
> > +			uint16_t port;
> > 
> > -	uint64_t ol_flags;        /**< Offload features. */
> > +			uint64_t ol_flags;        /**< Offload features. */
> 
> Either:
> 1. If the comment about 8 bytes init on rearm is correct: ol_flags should remain outside the struct and union, i.e. at top level, else

> 2. It would be nice to increase the size of the rearm_data variable to 16 byte, so it covers the entire struct being rearmed. (And the incorrect comment about how many bytes are being rearmed should be fixed.)
> 

thanks for picking this up, i think i've actually just got a mistake
here. i don't think ol_flags should have been lifted into the union; i'll
go back and do some double checking.

> > +		};
> > +	};
> > 
> >  	/* remaining bytes are set on RX when pulling packet from
> > descriptor */
> > -	RTE_MARKER rx_descriptor_fields1;
> > -
> > -	/*
> > -	 * The packet type, which is the combination of outer/inner L2,
> > L3, L4
> > -	 * and tunnel types. The packet_type is about data really present
> > in the
> > -	 * mbuf. Example: if vlan stripping is enabled, a received vlan
> > packet
> > -	 * would have RTE_PTYPE_L2_ETHER and not RTE_PTYPE_L2_VLAN
> > because the
> > -	 * vlan is stripped from the data.
> > -	 */
> >  	union {
> > -		uint32_t packet_type; /**< L2/L3/L4 and tunnel information.
> > */
> > -		__extension__
> > +		void *rx_descriptor_fields1;
> 
> Instead of using void* for rx_descriptor_fields1, it would be nice to make rx_descriptor_fields1 a type of the correct size. It might need to be an uint32_t array to avoid imposing additional alignment requirements.

as you've probably guessed i used the type from the original marker in
use. for api compat reasons i'll avoid changing type in this series.

> 
> > +
> > +		/*
> > +		 * The packet type, which is the combination of outer/inner
> > L2, L3, L4
> > +		 * and tunnel types. The packet_type is about data really
> > present in the
> > +		 * mbuf. Example: if vlan stripping is enabled, a received
> > vlan packet
> > +		 * would have RTE_PTYPE_L2_ETHER and not RTE_PTYPE_L2_VLAN
> > because the
> > +		 * vlan is stripped from the data.
> > +		 */
> >  		struct {
> > -			uint8_t l2_type:4;   /**< (Outer) L2 type. */
> > -			uint8_t l3_type:4;   /**< (Outer) L3 type. */
> > -			uint8_t l4_type:4;   /**< (Outer) L4 type. */
> > -			uint8_t tun_type:4;  /**< Tunnel type. */
> >  			union {
> > -				uint8_t inner_esp_next_proto;
> > -				/**< ESP next protocol type, valid if
> > -				 * RTE_PTYPE_TUNNEL_ESP tunnel type is set
> > -				 * on both Tx and Rx.
> > -				 */
> > +				uint32_t packet_type; /**< L2/L3/L4 and tunnel
> > information. */
> >  				__extension__
> >  				struct {
> > -					uint8_t inner_l2_type:4;
> > -					/**< Inner L2 type. */
> > -					uint8_t inner_l3_type:4;
> > -					/**< Inner L3 type. */
> > +					uint8_t l2_type:4;   /**< (Outer) L2
> > type. */
> > +					uint8_t l3_type:4;   /**< (Outer) L3
> > type. */
> > +					uint8_t l4_type:4;   /**< (Outer) L4
> > type. */
> > +					uint8_t tun_type:4;  /**< Tunnel type. */
> > +					union {
> > +						uint8_t inner_esp_next_proto;
> > +						/**< ESP next protocol type, valid
> > if
> > +						 * RTE_PTYPE_TUNNEL_ESP tunnel type
> > is set
> > +						 * on both Tx and Rx.
> > +						 */
> > +						__extension__
> > +						struct {
> > +							uint8_t inner_l2_type:4;
> > +							/**< Inner L2 type. */
> > +							uint8_t inner_l3_type:4;
> > +							/**< Inner L3 type. */
> > +						};
> > +					};
> > +					uint8_t inner_l4_type:4; /**< Inner L4
> > type. */
> >  				};
> >  			};
> > -			uint8_t inner_l4_type:4; /**< Inner L4 type. */
> > +			uint32_t pkt_len;         /**< Total pkt len: sum of
> > all segments. */
> >  		};
> >  	};
> > 
> > -	uint32_t pkt_len;         /**< Total pkt len: sum of all
> > segments. */
> >  	uint16_t data_len;        /**< Amount of data in segment buffer.
> > */
> >  	/** VLAN TCI (CPU order), valid if RTE_MBUF_F_RX_VLAN is set. */
> >  	uint16_t vlan_tci;
> > @@ -595,21 +604,23 @@ struct rte_mbuf {
> >  	struct rte_mempool *pool; /**< Pool from which mbuf was
> > allocated. */
> > 
> >  	/* second cache line - fields only used in slow path or on TX */
> > -	RTE_MARKER cacheline1 __rte_cache_min_aligned;
> > +	union {
> > +		void *cacheline1;
> 
> The __rte_cache_min_aligned cannot be removed. It provides cache line alignment for 32 bit platforms, where pointers in the first cache line only use 4 bytes.

oh no i forgot i needed to figure this out before submission. now that
it's here though i could use some help / suggestions.

the existing __rte_cache_min_aligned (and indeed standard alignas)
facilities are not of great utility when applied to anonymous unions;
further complicating things, it also has to work with C++.

i'll take this away and work on it some more but does anyone here have a
suggestion on how to align this anonymous union data member to the
desired alignment *without* the union being padded to min cache line
size and as a consequence causing the rte_mbuf struct to be 3 instead
of 2 cache lines? (that's essentially the problem i need help solving).

> 
> NB: The rte_mbuf structure could be optimized for 32 bit platforms by moving fields from the second cache line to the holes in the first, but that's another discussion.

likely could be optimized. a discussion for another time since we can't make
breaking abi changes.

> 
> > 
> >  #if RTE_IOVA_IN_MBUF
> > -	/**
> > -	 * Next segment of scattered packet. Must be NULL in the last
> > -	 * segment or in case of non-segmented packet.
> > -	 */
> > -	struct rte_mbuf *next;
> > +		/**
> > +		 * Next segment of scattered packet. Must be NULL in the
> > last
> > +		 * segment or in case of non-segmented packet.
> > +		 */
> > +		struct rte_mbuf *next;
> >  #else
> > -	/**
> > -	 * Reserved for dynamic fields
> > -	 * when the next pointer is in first cache line (i.e.
> > RTE_IOVA_IN_MBUF is 0).
> > -	 */
> > -	uint64_t dynfield2;
> > +		/**
> > +		 * Reserved for dynamic fields
> > +		 * when the next pointer is in first cache line (i.e.
> > RTE_IOVA_IN_MBUF is 0).
> > +		 */
> > +		uint64_t dynfield2;
> >  #endif
> > +	};
> > 
> >  	/* fields to support TX offloads */
> >  	union {
> > --
> > 1.8.3.1
  
Morten Brørup Jan. 31, 2024, 10:39 p.m. UTC | #5
> From: Tyler Retzlaff [mailto:roretzla@linux.microsoft.com]
> Sent: Wednesday, 31 January 2024 22.09
> 
> On Wed, Jan 31, 2024 at 10:18:37AM +0100, Morten Brørup wrote:
> > > From: Tyler Retzlaff [mailto:roretzla@linux.microsoft.com]
> > > Sent: Wednesday, 31 January 2024 00.26
> > >

[...]

> > >  	struct rte_mempool *pool; /**< Pool from which mbuf was
> > > allocated. */
> > >
> > >  	/* second cache line - fields only used in slow path or on TX */
> > > -	RTE_MARKER cacheline1 __rte_cache_min_aligned;
> > > +	union {
> > > +		void *cacheline1;
> >
> > The __rte_cache_min_aligned cannot be removed. It provides cache line
> alignment for 32 bit platforms, where pointers in the first cache line
> only use 4 bytes.
> 
> oh no i forgot i needed to figure this out before submission. now that
> it's here though i could use some help / suggestions.
> 
> the existing __rte_cache_min_aligned (and indeed standard alignas)
> facilities are not of great utility when applied to anonymous unions;
> further complicating things, it also has to work with C++.
> 
> i'll take this away and work on it some more but does anyone here have
> a
> suggestion on how to align this anonymous union data member to the
> desired alignment *without* the union being padded to min cache line
> size and as a consequence causing the rte_mbuf struct to be 3 instead
> of 2 cache lines? (that's essentially the problem i need help solving).

I would suggest simply removing __rte_cache_min_aligned (and the implicit padding that comes with it), and instead conditionally (#ifdef RTE_ARCH_32) adding an explicit uintptr_t padding field after each pointer field in the first cache line of the rte_mbuf struct.

But that would break the 32-bit ABI, so instead insert the explicit padding in the rte_mbuf struct at the end of its first cache line (where the implicit padding from __rte_cache_min_aligned is currently added), something like this:

	struct rte_mempool *pool; /**< Pool from which mbuf was allocated. */

+#ifdef RTE_ARCH_32
+	/* Padding to ensure correct alignment of cacheline1. */
+	uintptr_t pad_buf_addr;
+#if !RTE_IOVA_IN_MBUF
+	uintptr_t pad_next;
+#endif
+#endif /* RTE_ARCH_32 */

	/* second cache line - fields only used in slow path or on TX */

To be on the safe side, add a static_assert to verify that offsetof(struct rte_mbuf, cacheline1) == RTE_CACHE_LINE_MIN_SIZE. This should be true for all architectures, i.e. both 64 bit and 32 bit.
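
Something along these lines (a sketch of the suggested check):

/* cacheline1 must start at the minimum cache line boundary on both
 * 64-bit and 32-bit builds */
static_assert(offsetof(struct rte_mbuf, cacheline1) == RTE_CACHE_LINE_MIN_SIZE,
	"unexpected offset of cacheline1 in struct rte_mbuf");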
  
Morten Brørup Jan. 31, 2024, 10:55 p.m. UTC | #6
> From: Tyler Retzlaff [mailto:roretzla@linux.microsoft.com]
> Sent: Wednesday, 31 January 2024 21.46
> 
> On Wed, Jan 31, 2024 at 01:49:34PM +0000, Bruce Richardson wrote:
> > On Tue, Jan 30, 2024 at 03:26:13PM -0800, Tyler Retzlaff wrote:
> > > Replace the use of RTE_MARKER<x> with C11 anonymous unions to
> improve
> > > code portability between toolchains.
> > >
> > > Update use of rte_mbuf rearm_data field in net/ionic, net/sfc and
> > > net/virtio which were accessing field as a zero-length array.
> > >
> > > Signed-off-by: Tyler Retzlaff <roretzla@linux.microsoft.com>
> > > ---
> > >  drivers/net/ionic/ionic_lif.c               |   8 +-
> > >  drivers/net/ionic/ionic_rxtx_sg.c           |   4 +-
> > >  drivers/net/ionic/ionic_rxtx_simple.c       |   2 +-
> > >  drivers/net/sfc/sfc_ef100_rx.c              |   8 +-
> > >  drivers/net/sfc/sfc_ef10_rx.c               |  12 +--
> > >  drivers/net/virtio/virtio_rxtx_packed_avx.h |   8 +-
> > >  lib/mbuf/rte_mbuf_core.h                    | 135 +++++++++++++++-
> ------------
> > >  7 files changed, 94 insertions(+), 83 deletions(-)
> > >
> > <snip>
> > @@ -464,9 +464,10 @@ enum {
> > >   * The generic rte_mbuf, containing a packet mbuf.
> > >   */
> > >  struct rte_mbuf {
> > > -	RTE_MARKER cacheline0;
> > > -
> > > -	void *buf_addr;           /**< Virtual address of segment buffer.
> */
> > > +	union {
> > > +	    void *cacheline0;
> > > +	    void *buf_addr;           /**< Virtual address of segment
> buffer. */
> > > +	};
> >
> > This marker is never used, so we should just look to drop it. I think
> it
> > was originally added to have an equivalent to the cacheline1 marker.
> 
> it's actually got a use in one location.
> 
> rte_mbuf.h:
> 
> static inline void
> rte_mbuf_prefetch_part1(struct rte_mbuf *m)
> {	
>         rte_prefetch0(&m->cacheline0);
> }	
> 
> > However, that would be an ABI change, so I'm ok to have this as-is
> for now.

Typo: API change, not ABI change.

> 
> do you mean api change? just asking to make sure i understand what i'm
> doing.
> 
> as i understand how this extension (marker) works, removing the
> cacheline0 marker would not alter the layout of the struct. that is,
> neither the size of the struct, the size of any field, nor the offset of
> any field would change with the marker removed.

Correctly understood, Tyler.

The struct layout is unmodified, so it's not an ABI change.
However, it's an API change, because applications cannot access the field anymore.

Although DPDK itself doesn't use the field, other applications might use rte_prefetch0(&m->cacheline0) instead of rte_mbuf_prefetch_part1(m).
After checking in-house, I can mention at least one company doing that. ;-)

We should keep the cacheline0 field and not break the API. Not for my sake, but for other applications. :-)
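
For reference, the spellings in question are equivalent today, since cacheline0 sits at offset 0 of the mbuf (a sketch, not a recommendation):

/* equivalent prefetches of the first mbuf cache line */
rte_mbuf_prefetch_part1(m);        /* helper from rte_mbuf.h */
rte_prefetch0(&m->cacheline0);     /* direct use of the marker, as some apps do */
rte_prefetch0(m);                  /* cacheline0 is at offset 0, so the mbuf pointer works too */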
  
Tyler Retzlaff Feb. 13, 2024, 6:45 a.m. UTC | #7
The zero sized RTE_MARKER<n> typedefs are a GCC extension unsupported by
MSVC.  Replace the use of the RTE_MARKER typedefs with anonymous unions.

Note:

v1 of the series tried to maintain the API. After some study it has been
discovered that some existing uses of the markers do not produce a compilation
failure but evaluate to unintended values in the absence of adaptation.
For this reason the existing markers cannot be removed, because it is too hard
to identify what needs to be changed by consumers. While the ABI has been
maintained, the subtle API change is just too risky.

The question I'm asking now is how to gracefully deprecate the markers
while allowing consumption of the struct on Windows.

I propose the following:

* Introduce the unions as per-this series except instead of adding members
  that match the original RTE_MARKER field names provide *new* names.
* Retain (conditionally compiled away on Windows) the existing RTE_MARKER
  fields with their original names.
* Convert in-tree code to use the new names in the unions.

The old names & markers would be announced for deprecation and eventually
removed and when they are the conditional compilation would also go away.

Thoughts?

v2:
    * Introduce additional union/struct to agnostically pad cachline0 to
      RTE_CACHE_LINE_MIN_SIZE without conditional compilation.
    * Adapt ixgbe access of rearm_data field.
    * Move ol_flags field out of rearm_data union where it didn't belong.
    * Added a couple of static_asserts for offset of cacheline1 and
      sizeof struct rte_mbuf.

Tyler Retzlaff (1):
  mbuf: replace GCC marker extension with C11 anonymous unions

 drivers/net/ionic/ionic_lif.c               |   8 +-
 drivers/net/ionic/ionic_rxtx_sg.c           |   4 +-
 drivers/net/ionic/ionic_rxtx_simple.c       |   2 +-
 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c      |   8 +-
 drivers/net/sfc/sfc_ef100_rx.c              |   8 +-
 drivers/net/sfc/sfc_ef10_rx.c               |  12 +-
 drivers/net/virtio/virtio_rxtx_packed_avx.h |   8 +-
 lib/mbuf/rte_mbuf_core.h                    | 276 ++++++++++++++++------------
 8 files changed, 179 insertions(+), 147 deletions(-)
  
Bruce Richardson Feb. 13, 2024, 8:57 a.m. UTC | #8
On Mon, Feb 12, 2024 at 10:45:40PM -0800, Tyler Retzlaff wrote:
> The zero sized RTE_MARKER<n> typedefs are a GCC extension unsupported by
> MSVC.  Replace the use of the RTE_MARKER typedefs with anonymous unions.
> 
> Note:
> 
> v1 of the series tried to maintain the API. After some study it has been
> discovered that some existing uses of the markers do not produce a compilation
> failure but evaluate to unintended values in the absence of adaptation.
> For this reason the existing markers cannot be removed, because it is too hard
> to identify what needs to be changed by consumers. While the ABI has been
> maintained, the subtle API change is just too risky.
> 
> The question I'm asking now is how to gracefully deprecate the markers
> while allowing consumption of the struct on Windows.
> 
> I propose the following:
> 
> * Introduce the unions as per-this series except instead of adding members
>   that match the original RTE_MARKER field names provide *new* names.
> * Retain (conditionally compiled away on Windows) the existing RTE_MARKER
>   fields with their original names.
> * Convert in-tree code to use the new names in the unions.
> 
> The old names & markers would be announced for deprecation and eventually
> removed and when they are the conditional compilation would also go away.
> 
> Thoughts?
> 
This seems a good approach. +1 from me for the idea.

/Bruce
  
Morten Brørup Feb. 13, 2024, 5:09 p.m. UTC | #9
> From: Tyler Retzlaff [mailto:roretzla@linux.microsoft.com]
> Sent: Tuesday, 13 February 2024 07.46
> 
> The zero sized RTE_MARKER<n> typedefs are a GCC extension unsupported
> by
> MSVC.  Replace the use of the RTE_MARKER typedefs with anonymous
> unions.
> 
> Note:
> 
> v1 of the series tried to maintain the API. After some study it has
> been discovered that some existing uses of the markers do not produce a
> compilation failure but evaluate to unintended values in the absence of
> adaptation. For this reason the existing markers cannot be removed,
> because it is too hard to identify what needs to be changed by
> consumers. While the ABI has been maintained, the subtle API change is
> just too risky.
> 
> The question I'm asking now is how to gracefully deprecate the markers
> while allowing consumption of the struct on Windows.
> 
> I propose the following:
> 
> * Introduce the unions as per-this series except instead of adding
> members
>   that match the original RTE_MARKER field names provide *new* names.
> * Retain (conditionally compiled away on Windows) the existing
> RTE_MARKER
>   fields with their original names.
> * Convert in-tree code to use the new names in the unions.
> 
> The old names & markers would be announced for deprecation and
> eventually
> removed and when they are the conditional compilation would also go
> away.
> 
> Thoughts?

Seems like the right thing to do!

The modified type of rearm_data might not be noticed by out-of-tree PMD developers, so using a new name for the new type reduces the risk.

If some of the markers maintain their type or get a compatible type (from an API perspective), they can keep their names.
  
Tyler Retzlaff Feb. 13, 2024, 11:33 p.m. UTC | #10
Here is the latest iteration of the proposed change to allow struct rte_mbuf
to be consumed by MSVC.

* Introduce an internal __rte_marker macro conditionally expanded for MSVC
  vs existing users of the struct. At some point we can uncomment __rte_deprecated
  to assist migration away from the current marker fields for applications
  after appropriate announcement periods, etc.

* Introduce anonymous unions to allow aliasing of the previous named
  offsets by a *new* name.  The intention would be to convert the dpdk tree
  to use the new names along with this change and enable __rte_deprecated
  for dpdk builds (not applications) to avoid accidental re-introduction.

* The anonymous unions are now also used to pad cacheline0 and cacheline1 instead
  of __rte_cache_min_aligned.

* Converted the type of the fields for the named markers to char[] instead of
  uint8_t[].
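
The rte_common.h hunk in the diffstat below presumably carries that macro; a rough sketch of the idea (the exact definition here is an assumption, not the submitted code):

/* expand a legacy marker field for toolchains that accept zero-size arrays,
 * and to nothing for MSVC; __rte_deprecated can be added once announced */
#ifdef RTE_TOOLCHAIN_MSVC
#define __rte_marker(type, name)
#else
#define __rte_marker(type, name) type name
#endif

/* usage sketch inside the struct: __rte_marker(RTE_MARKER64, rearm_data); */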

Tyler Retzlaff (1):
  mbuf: deprecate GCC marker in rte mbuf struct

 lib/eal/include/rte_common.h |   6 +
 lib/mbuf/rte_mbuf_core.h     | 365 +++++++++++++++++++++++--------------------
 2 files changed, 201 insertions(+), 170 deletions(-)
  
Morten Brørup Feb. 14, 2024, 10:49 a.m. UTC | #11
> From: Tyler Retzlaff [mailto:roretzla@linux.microsoft.com]
> Sent: Wednesday, 14 February 2024 00.33
> 
> Here is the latest iteration of the proposed change to allow struct
> rte_mbuf
> to be consumed by MSVC.
> 
> * Introduce an internal __rte_marker macro conditionally expanded for
> MSVC
>   vs existing users of the struct. At some point we can uncomment
> __rte_deprecated
>   to assist migration away from the current marker fields for
> applications
>   after appropriate announcement periods etc..
> 
> * Introduce anonymous unions to allow aliasing of the previous named
>   offsets by a *new* name.  The intention would be to convert the dpdk
> tree
>   to use the new names along with this change and enable
> __rte_deprecated
>   for dpdk builds (not applications) to avoid accidental re-
> introduction.
> 
> * The anonymous unions are now also used to pad cacheline0 and
> cacheline1 instead
>   of __rte_cache_min_aligned.

ACK. This design also seems reusable for similar structures.
  
Stephen Hemminger Feb. 26, 2024, 1:18 a.m. UTC | #12
On Tue, 13 Feb 2024 15:33:28 -0800
Tyler Retzlaff <roretzla@linux.microsoft.com> wrote:

> Here is the latest iteration of the proposed change to allow struct rte_mbuf
> to be consumed by MSVC.
> 
> * Introduce an internal __rte_marker macro conditionally expanded for MSVC
>   vs existing users of the struct. At some point we can uncomment __rte_deprecated
>   to assist migration away from the current marker fields for applications
>   after appropriate announcement periods, etc.
> 
> * Introduce anonymous unions to allow aliasing of the previous named
>   offsets by a *new* name.  The intention would be to convert the dpdk tree
>   to use the new names along with this change and enable __rte_deprecated
>   for dpdk builds (not applications) to avoid accidental re-introduction.
> 
> * The anonymous unions are now also used to pad cacheline0 and cacheline1 instead
>   of __rte_cache_min_aligned.
> 
> * Converted the type of the fields for the named markers to char[] instead of
>   uint8_t[].
> 
> Tyler Retzlaff (1):
>   mbuf: deprecate GCC marker in rte mbuf struct
> 
>  lib/eal/include/rte_common.h |   6 +
>  lib/mbuf/rte_mbuf_core.h     | 365 +++++++++++++++++++++++--------------------
>  2 files changed, 201 insertions(+), 170 deletions(-)
> 

I was never convinced that __rte_marker was a good idea in the first place.

It seemed to be useful only as annotation or for use by prefetch.
The problem is that for annotation, it can easily be wrong if a different
cache size is used or the structure changes.
For prefetch, just using the structure and pointer math based on the cache line
seems like a better option. Plus, DPDK does excessive and unproven prefetching.
For real-world cases prefetching doesn't help unless there are enough cycles
between when the prefetch is issued and when the data is used. If the gap is
too long, the prefetch is useless; if too close, the extra overhead of the
prefetch slows down the intervening execution units.
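
A sketch of the pointer-math alternative mentioned above, using existing EAL helpers:

/* prefetch both mbuf cache lines without relying on marker fields */
rte_prefetch0(m);                                        /* first cache line  */
rte_prefetch0(RTE_PTR_ADD(m, RTE_CACHE_LINE_MIN_SIZE));  /* second cache line */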
  

Patch

diff --git a/drivers/net/ionic/ionic_lif.c b/drivers/net/ionic/ionic_lif.c
index 25b490d..fd99f39 100644
--- a/drivers/net/ionic/ionic_lif.c
+++ b/drivers/net/ionic/ionic_lif.c
@@ -725,8 +725,8 @@ 
 
 	rte_compiler_barrier();
 
-	RTE_BUILD_BUG_ON(sizeof(rxm.rearm_data[0]) != sizeof(uint64_t));
-	return rxm.rearm_data[0];
+	RTE_BUILD_BUG_ON(sizeof(rxm.rearm_data) != sizeof(uint64_t));
+	return rxm.rearm_data;
 }
 
 static uint64_t
@@ -743,8 +743,8 @@ 
 
 	rte_compiler_barrier();
 
-	RTE_BUILD_BUG_ON(sizeof(rxm.rearm_data[0]) != sizeof(uint64_t));
-	return rxm.rearm_data[0];
+	RTE_BUILD_BUG_ON(sizeof(rxm.rearm_data) != sizeof(uint64_t));
+	return rxm.rearm_data;
 }
 
 int
diff --git a/drivers/net/ionic/ionic_rxtx_sg.c b/drivers/net/ionic/ionic_rxtx_sg.c
index ab8e56e..a569dd1 100644
--- a/drivers/net/ionic/ionic_rxtx_sg.c
+++ b/drivers/net/ionic/ionic_rxtx_sg.c
@@ -285,7 +285,7 @@ 
 	info[0] = NULL;
 
 	/* Set the mbuf metadata based on the cq entry */
-	rxm->rearm_data[0] = rxq->rearm_data;
+	rxm->rearm_data = rxq->rearm_data;
 	rxm->pkt_len = cq_desc_len;
 	rxm->data_len = RTE_MIN(rxq->hdr_seg_size, cq_desc_len);
 	left = cq_desc_len - rxm->data_len;
@@ -298,7 +298,7 @@ 
 		info[i] = NULL;
 
 		/* Set the chained mbuf metadata */
-		rxm_seg->rearm_data[0] = rxq->rearm_seg_data;
+		rxm_seg->rearm_data = rxq->rearm_seg_data;
 		rxm_seg->data_len = RTE_MIN(rxq->seg_size, left);
 		left -= rxm_seg->data_len;
 
diff --git a/drivers/net/ionic/ionic_rxtx_simple.c b/drivers/net/ionic/ionic_rxtx_simple.c
index 5f81856..1978610 100644
--- a/drivers/net/ionic/ionic_rxtx_simple.c
+++ b/drivers/net/ionic/ionic_rxtx_simple.c
@@ -256,7 +256,7 @@ 
 	info[0] = NULL;
 
 	/* Set the mbuf metadata based on the cq entry */
-	rxm->rearm_data[0] = rxq->rearm_data;
+	rxm->rearm_data = rxq->rearm_data;
 	rxm->pkt_len = cq_desc_len;
 	rxm->data_len = cq_desc_len;
 
diff --git a/drivers/net/sfc/sfc_ef100_rx.c b/drivers/net/sfc/sfc_ef100_rx.c
index 2677003..23918d5 100644
--- a/drivers/net/sfc/sfc_ef100_rx.c
+++ b/drivers/net/sfc/sfc_ef100_rx.c
@@ -553,9 +553,9 @@  struct sfc_ef100_rxq {
 		pkt = sfc_ef100_rx_next_mbuf(rxq);
 		__rte_mbuf_raw_sanity_check(pkt);
 
-		RTE_BUILD_BUG_ON(sizeof(pkt->rearm_data[0]) !=
+		RTE_BUILD_BUG_ON(sizeof(pkt->rearm_data) !=
 				 sizeof(rxq->rearm_data));
-		pkt->rearm_data[0] = rxq->rearm_data;
+		pkt->rearm_data = rxq->rearm_data;
 
 		/* data_off already moved past Rx prefix */
 		rx_prefix = (const efx_xword_t *)sfc_ef100_rx_pkt_prefix(pkt);
@@ -759,8 +759,8 @@  struct sfc_ef100_rxq {
 
 	/* rearm_data covers structure members filled in above */
 	rte_compiler_barrier();
-	RTE_BUILD_BUG_ON(sizeof(m.rearm_data[0]) != sizeof(uint64_t));
-	return m.rearm_data[0];
+	RTE_BUILD_BUG_ON(sizeof(m.rearm_data) != sizeof(uint64_t));
+	return m.rearm_data;
 }
 
 static sfc_dp_rx_qcreate_t sfc_ef100_rx_qcreate;
diff --git a/drivers/net/sfc/sfc_ef10_rx.c b/drivers/net/sfc/sfc_ef10_rx.c
index 30a320d..60bc098 100644
--- a/drivers/net/sfc/sfc_ef10_rx.c
+++ b/drivers/net/sfc/sfc_ef10_rx.c
@@ -322,8 +322,8 @@  struct sfc_ef10_rxq {
 
 	m = rxd->mbuf;
 
-	RTE_BUILD_BUG_ON(sizeof(m->rearm_data[0]) != sizeof(rxq->rearm_data));
-	m->rearm_data[0] = rxq->rearm_data;
+	RTE_BUILD_BUG_ON(sizeof(m->rearm_data) != sizeof(rxq->rearm_data));
+	m->rearm_data = rxq->rearm_data;
 
 	/* Classify packet based on Rx event */
 	/* Mask RSS hash offload flag if RSS is not enabled */
@@ -377,9 +377,9 @@  struct sfc_ef10_rxq {
 			rxq->completed = pending;
 		}
 
-		RTE_BUILD_BUG_ON(sizeof(m->rearm_data[0]) !=
+		RTE_BUILD_BUG_ON(sizeof(m->rearm_data) !=
 				 sizeof(rxq->rearm_data));
-		m->rearm_data[0] = rxq->rearm_data;
+		m->rearm_data = rxq->rearm_data;
 
 		/* Event-dependent information is the same */
 		m->ol_flags = m0->ol_flags;
@@ -633,8 +633,8 @@  struct sfc_ef10_rxq {
 
 	/* rearm_data covers structure members filled in above */
 	rte_compiler_barrier();
-	RTE_BUILD_BUG_ON(sizeof(m.rearm_data[0]) != sizeof(uint64_t));
-	return m.rearm_data[0];
+	RTE_BUILD_BUG_ON(sizeof(m.rearm_data) != sizeof(uint64_t));
+	return m.rearm_data;
 }
 
 static sfc_dp_rx_qcreate_t sfc_ef10_rx_qcreate;
diff --git a/drivers/net/virtio/virtio_rxtx_packed_avx.h b/drivers/net/virtio/virtio_rxtx_packed_avx.h
index 584ac72..a9ce53f 100644
--- a/drivers/net/virtio/virtio_rxtx_packed_avx.h
+++ b/drivers/net/virtio/virtio_rxtx_packed_avx.h
@@ -36,10 +36,10 @@ 
 	/* Load four mbufs rearm data */
 	RTE_BUILD_BUG_ON(REFCNT_BITS_OFFSET >= 64);
 	RTE_BUILD_BUG_ON(SEG_NUM_BITS_OFFSET >= 64);
-	__m256i mbufs = _mm256_set_epi64x(*tx_pkts[3]->rearm_data,
-					  *tx_pkts[2]->rearm_data,
-					  *tx_pkts[1]->rearm_data,
-					  *tx_pkts[0]->rearm_data);
+	__m256i mbufs = _mm256_set_epi64x(tx_pkts[3]->rearm_data,
+					  tx_pkts[2]->rearm_data,
+					  tx_pkts[1]->rearm_data,
+					  tx_pkts[0]->rearm_data);
 
 	/* refcnt=1 and nb_segs=1 */
 	__m256i mbuf_ref = _mm256_set1_epi64x(DEFAULT_REARM_DATA);
diff --git a/lib/mbuf/rte_mbuf_core.h b/lib/mbuf/rte_mbuf_core.h
index 5688683..d731ea0 100644
--- a/lib/mbuf/rte_mbuf_core.h
+++ b/lib/mbuf/rte_mbuf_core.h
@@ -464,9 +464,10 @@  enum {
  * The generic rte_mbuf, containing a packet mbuf.
  */
 struct rte_mbuf {
-	RTE_MARKER cacheline0;
-
-	void *buf_addr;           /**< Virtual address of segment buffer. */
+	union {
+	    void *cacheline0;
+	    void *buf_addr;           /**< Virtual address of segment buffer. */
+	};
 #if RTE_IOVA_IN_MBUF
 	/**
 	 * Physical address of segment buffer.
@@ -487,69 +488,77 @@  struct rte_mbuf {
 #endif
 
 	/* next 8 bytes are initialised on RX descriptor rearm */
-	RTE_MARKER64 rearm_data;
-	uint16_t data_off;
-
-	/**
-	 * Reference counter. Its size should at least equal to the size
-	 * of port field (16 bits), to support zero-copy broadcast.
-	 * It should only be accessed using the following functions:
-	 * rte_mbuf_refcnt_update(), rte_mbuf_refcnt_read(), and
-	 * rte_mbuf_refcnt_set(). The functionality of these functions (atomic,
-	 * or non-atomic) is controlled by the RTE_MBUF_REFCNT_ATOMIC flag.
-	 */
-	RTE_ATOMIC(uint16_t) refcnt;
+	union {
+		uint64_t rearm_data;
+		struct {
+			uint16_t data_off;
+
+			/**
+			 * Reference counter. Its size should at least equal to the size
+			 * of port field (16 bits), to support zero-copy broadcast.
+			 * It should only be accessed using the following functions:
+			 * rte_mbuf_refcnt_update(), rte_mbuf_refcnt_read(), and
+			 * rte_mbuf_refcnt_set(). The functionality of these functions (atomic,
+			 * or non-atomic) is controlled by the RTE_MBUF_REFCNT_ATOMIC flag.
+			 */
+			RTE_ATOMIC(uint16_t) refcnt;
 
-	/**
-	 * Number of segments. Only valid for the first segment of an mbuf
-	 * chain.
-	 */
-	uint16_t nb_segs;
+			/**
+			 * Number of segments. Only valid for the first segment of an mbuf
+			 * chain.
+			 */
+			uint16_t nb_segs;
 
-	/** Input port (16 bits to support more than 256 virtual ports).
-	 * The event eth Tx adapter uses this field to specify the output port.
-	 */
-	uint16_t port;
+			/** Input port (16 bits to support more than 256 virtual ports).
+			 * The event eth Tx adapter uses this field to specify the output port.
+			 */
+			uint16_t port;
 
-	uint64_t ol_flags;        /**< Offload features. */
+			uint64_t ol_flags;        /**< Offload features. */
+		};
+	};
 
 	/* remaining bytes are set on RX when pulling packet from descriptor */
-	RTE_MARKER rx_descriptor_fields1;
-
-	/*
-	 * The packet type, which is the combination of outer/inner L2, L3, L4
-	 * and tunnel types. The packet_type is about data really present in the
-	 * mbuf. Example: if vlan stripping is enabled, a received vlan packet
-	 * would have RTE_PTYPE_L2_ETHER and not RTE_PTYPE_L2_VLAN because the
-	 * vlan is stripped from the data.
-	 */
 	union {
-		uint32_t packet_type; /**< L2/L3/L4 and tunnel information. */
-		__extension__
+		void *rx_descriptor_fields1;
+
+		/*
+		 * The packet type, which is the combination of outer/inner L2, L3, L4
+		 * and tunnel types. The packet_type is about data really present in the
+		 * mbuf. Example: if vlan stripping is enabled, a received vlan packet
+		 * would have RTE_PTYPE_L2_ETHER and not RTE_PTYPE_L2_VLAN because the
+		 * vlan is stripped from the data.
+		 */
 		struct {
-			uint8_t l2_type:4;   /**< (Outer) L2 type. */
-			uint8_t l3_type:4;   /**< (Outer) L3 type. */
-			uint8_t l4_type:4;   /**< (Outer) L4 type. */
-			uint8_t tun_type:4;  /**< Tunnel type. */
 			union {
-				uint8_t inner_esp_next_proto;
-				/**< ESP next protocol type, valid if
-				 * RTE_PTYPE_TUNNEL_ESP tunnel type is set
-				 * on both Tx and Rx.
-				 */
+				uint32_t packet_type; /**< L2/L3/L4 and tunnel information. */
 				__extension__
 				struct {
-					uint8_t inner_l2_type:4;
-					/**< Inner L2 type. */
-					uint8_t inner_l3_type:4;
-					/**< Inner L3 type. */
+					uint8_t l2_type:4;   /**< (Outer) L2 type. */
+					uint8_t l3_type:4;   /**< (Outer) L3 type. */
+					uint8_t l4_type:4;   /**< (Outer) L4 type. */
+					uint8_t tun_type:4;  /**< Tunnel type. */
+					union {
+						uint8_t inner_esp_next_proto;
+						/**< ESP next protocol type, valid if
+						 * RTE_PTYPE_TUNNEL_ESP tunnel type is set
+						 * on both Tx and Rx.
+						 */
+						__extension__
+						struct {
+							uint8_t inner_l2_type:4;
+							/**< Inner L2 type. */
+							uint8_t inner_l3_type:4;
+							/**< Inner L3 type. */
+						};
+					};
+					uint8_t inner_l4_type:4; /**< Inner L4 type. */
 				};
 			};
-			uint8_t inner_l4_type:4; /**< Inner L4 type. */
+			uint32_t pkt_len;         /**< Total pkt len: sum of all segments. */
 		};
 	};
 
-	uint32_t pkt_len;         /**< Total pkt len: sum of all segments. */
 	uint16_t data_len;        /**< Amount of data in segment buffer. */
 	/** VLAN TCI (CPU order), valid if RTE_MBUF_F_RX_VLAN is set. */
 	uint16_t vlan_tci;
@@ -595,21 +604,23 @@  struct rte_mbuf {
 	struct rte_mempool *pool; /**< Pool from which mbuf was allocated. */
 
 	/* second cache line - fields only used in slow path or on TX */
-	RTE_MARKER cacheline1 __rte_cache_min_aligned;
+	union {
+		void *cacheline1;
 
 #if RTE_IOVA_IN_MBUF
-	/**
-	 * Next segment of scattered packet. Must be NULL in the last
-	 * segment or in case of non-segmented packet.
-	 */
-	struct rte_mbuf *next;
+		/**
+		 * Next segment of scattered packet. Must be NULL in the last
+		 * segment or in case of non-segmented packet.
+		 */
+		struct rte_mbuf *next;
 #else
-	/**
-	 * Reserved for dynamic fields
-	 * when the next pointer is in first cache line (i.e. RTE_IOVA_IN_MBUF is 0).
-	 */
-	uint64_t dynfield2;
+		/**
+		 * Reserved for dynamic fields
+		 * when the next pointer is in first cache line (i.e. RTE_IOVA_IN_MBUF is 0).
+		 */
+		uint64_t dynfield2;
 #endif
+	};
 
 	/* fields to support TX offloads */
 	union {