[v2,1/2] examples/l3fwd: common packet group functionality

Message ID 20220617074241.3260496-1-rbhansali@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: akhil goyal
Headers
Series [v2,1/2] examples/l3fwd: common packet group functionality |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Rahul Bhansali June 17, 2022, 7:42 a.m. UTC
  This will make the packet grouping function common, so
that other examples can utilize as per need.

Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
---
Changes in v2: New patch to address review comment.

 examples/common/neon_common.h |  50 ++++++++++++
 examples/common/pkt_group.h   | 139 ++++++++++++++++++++++++++++++++++
 examples/l3fwd/Makefile       |   5 +-
 examples/l3fwd/l3fwd.h        |   2 -
 examples/l3fwd/l3fwd_common.h | 129 +------------------------------
 examples/l3fwd/l3fwd_neon.h   |  43 +----------
 examples/meson.build          |   2 +-
 7 files changed, 198 insertions(+), 172 deletions(-)
 create mode 100644 examples/common/neon_common.h
 create mode 100644 examples/common/pkt_group.h

--
2.25.1
  

Comments

Rahul Bhansali June 17, 2022, 7:50 a.m. UTC | #1
CC: Konstantin Ananyev

> -----Original Message-----
> From: Rahul Bhansali <rbhansali@marvell.com>
> Sent: Friday, June 17, 2022 1:13 PM
> To: dev@dpdk.org; Ruifeng Wang <ruifeng.wang@arm.com>
> Cc: Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Rahul Bhansali
> <rbhansali@marvell.com>
> Subject: [PATCH v2 1/2] examples/l3fwd: common packet group functionality
> 
> This will make the packet grouping function common, so that other examples
> can utilize as per need.
> 
> Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
> ---
> Changes in v2: New patch to address review comment.
> 
>  examples/common/neon_common.h |  50 ++++++++++++
>  examples/common/pkt_group.h   | 139
> ++++++++++++++++++++++++++++++++++
>  examples/l3fwd/Makefile       |   5 +-
>  examples/l3fwd/l3fwd.h        |   2 -
>  examples/l3fwd/l3fwd_common.h | 129 +------------------------------
>  examples/l3fwd/l3fwd_neon.h   |  43 +----------
>  examples/meson.build          |   2 +-
>  7 files changed, 198 insertions(+), 172 deletions(-)  create mode 100644
> examples/common/neon_common.h  create mode 100644
> examples/common/pkt_group.h
> 
> diff --git a/examples/common/neon_common.h
> b/examples/common/neon_common.h new file mode 100644 index
> 0000000000..f01b5ab6bc
> --- /dev/null
> +++ b/examples/common/neon_common.h
> @@ -0,0 +1,50 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2016-2018 Intel Corporation.
> + * Copyright(c) 2017-2018 Linaro Limited.
> + * Copyright(C) 2022 Marvell.
> + */
> +
> +#ifndef _NEON_COMMON_H_
> +#define _NEON_COMMON_H_
> +
> +#include "pkt_group.h"
> +
> +/*
> + * Group consecutive packets with the same destination port in bursts of 4.
> + * Suppose we have array of destination ports:
> + * dst_port[] = {a, b, c, d,, e, ... }
> + * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
> + * We doing 4 comparisons at once and the result is 4 bit mask.
> + * This mask is used as an index into prebuild array of pnum values.
> + */
> +static inline uint16_t *
> +neon_port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
> +		  uint16x8_t dp2)
> +{
> +	union {
> +		uint16_t u16[FWDSTEP + 1];
> +		uint64_t u64;
> +	} *pnum = (void *)pn;
> +
> +	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
> +	int32_t v;
> +
> +	dp1 = vceqq_u16(dp1, dp2);
> +	dp1 = vandq_u16(dp1, mask);
> +	v = vaddvq_u16(dp1);
> +
> +	/* update last port counter. */
> +	lp[0] += gptbl[v].lpv;
> +	rte_compiler_barrier();
> +
> +	/* if dest port value has changed. */
> +	if (v != GRPMSK) {
> +		pnum->u64 = gptbl[v].pnum;
> +		pnum->u16[FWDSTEP] = 1;
> +		lp = pnum->u16 + gptbl[v].idx;
> +	}
> +
> +	return lp;
> +}
> +
> +#endif /* _NEON_COMMON_H_ */
> diff --git a/examples/common/pkt_group.h b/examples/common/pkt_group.h
> new file mode 100644 index 0000000000..8b26d9380f
> --- /dev/null
> +++ b/examples/common/pkt_group.h
> @@ -0,0 +1,139 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2016-2018 Intel Corporation.
> + * Copyright(c) 2017-2018 Linaro Limited.
> + * Copyright(C) 2022 Marvell.
> + */
> +
> +#ifndef _PKT_GROUP_H_
> +#define _PKT_GROUP_H_
> +
> +#define FWDSTEP	4
> +
> +/*
> + * Group consecutive packets with the same destination port into one burst.
> + * To avoid extra latency this is done together with some other packet
> + * processing, but after we made a final decision about packet's destination.
> + * To do this we maintain:
> + * pnum - array of number of consecutive packets with the same dest
> +port for
> + * each packet in the input burst.
> + * lp - pointer to the last updated element in the pnum.
> + * dlp - dest port value lp corresponds to.
> + */
> +
> +#define	GRPSZ	(1 << FWDSTEP)
> +#define	GRPMSK	(GRPSZ - 1)
> +
> +#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
> +	if (likely((dlp) == (dcp)[(idx)])) {         \
> +		(lp)[0]++;                           \
> +	} else {                                     \
> +		(dlp) = (dcp)[idx];                  \
> +		(lp) = (pn) + (idx);                 \
> +		(lp)[0] = 1;                         \
> +	}                                            \
> +} while (0)
> +
> +static const struct {
> +	uint64_t pnum; /* prebuild 4 values for pnum[]. */
> +	int32_t  idx;  /* index for new last updated elemnet. */
> +	uint16_t lpv;  /* add value to the last updated element. */ }
> +gptbl[GRPSZ] = {
> +	{
> +		/* 0: a != b, b != c, c != d, d != e */
> +		.pnum = UINT64_C(0x0001000100010001),
> +		.idx = 4,
> +		.lpv = 0,
> +	},
> +	{
> +		/* 1: a == b, b != c, c != d, d != e */
> +		.pnum = UINT64_C(0x0001000100010002),
> +		.idx = 4,
> +		.lpv = 1,
> +	},
> +	{
> +		/* 2: a != b, b == c, c != d, d != e */
> +		.pnum = UINT64_C(0x0001000100020001),
> +		.idx = 4,
> +		.lpv = 0,
> +	},
> +	{
> +		/* 3: a == b, b == c, c != d, d != e */
> +		.pnum = UINT64_C(0x0001000100020003),
> +		.idx = 4,
> +		.lpv = 2,
> +	},
> +	{
> +		/* 4: a != b, b != c, c == d, d != e */
> +		.pnum = UINT64_C(0x0001000200010001),
> +		.idx = 4,
> +		.lpv = 0,
> +	},
> +	{
> +		/* 5: a == b, b != c, c == d, d != e */
> +		.pnum = UINT64_C(0x0001000200010002),
> +		.idx = 4,
> +		.lpv = 1,
> +	},
> +	{
> +		/* 6: a != b, b == c, c == d, d != e */
> +		.pnum = UINT64_C(0x0001000200030001),
> +		.idx = 4,
> +		.lpv = 0,
> +	},
> +	{
> +		/* 7: a == b, b == c, c == d, d != e */
> +		.pnum = UINT64_C(0x0001000200030004),
> +		.idx = 4,
> +		.lpv = 3,
> +	},
> +	{
> +		/* 8: a != b, b != c, c != d, d == e */
> +		.pnum = UINT64_C(0x0002000100010001),
> +		.idx = 3,
> +		.lpv = 0,
> +	},
> +	{
> +		/* 9: a == b, b != c, c != d, d == e */
> +		.pnum = UINT64_C(0x0002000100010002),
> +		.idx = 3,
> +		.lpv = 1,
> +	},
> +	{
> +		/* 0xa: a != b, b == c, c != d, d == e */
> +		.pnum = UINT64_C(0x0002000100020001),
> +		.idx = 3,
> +		.lpv = 0,
> +	},
> +	{
> +		/* 0xb: a == b, b == c, c != d, d == e */
> +		.pnum = UINT64_C(0x0002000100020003),
> +		.idx = 3,
> +		.lpv = 2,
> +	},
> +	{
> +		/* 0xc: a != b, b != c, c == d, d == e */
> +		.pnum = UINT64_C(0x0002000300010001),
> +		.idx = 2,
> +		.lpv = 0,
> +	},
> +	{
> +		/* 0xd: a == b, b != c, c == d, d == e */
> +		.pnum = UINT64_C(0x0002000300010002),
> +		.idx = 2,
> +		.lpv = 1,
> +	},
> +	{
> +		/* 0xe: a != b, b == c, c == d, d == e */
> +		.pnum = UINT64_C(0x0002000300040001),
> +		.idx = 1,
> +		.lpv = 0,
> +	},
> +	{
> +		/* 0xf: a == b, b == c, c == d, d == e */
> +		.pnum = UINT64_C(0x0002000300040005),
> +		.idx = 0,
> +		.lpv = 4,
> +	},
> +};
> +
> +#endif /* _PKT_GROUP_H_ */
> diff --git a/examples/l3fwd/Makefile b/examples/l3fwd/Makefile index
> 8efe6378e2..8dbe85c2e6 100644
> --- a/examples/l3fwd/Makefile
> +++ b/examples/l3fwd/Makefile
> @@ -22,6 +22,7 @@ shared: build/$(APP)-shared
>  static: build/$(APP)-static
>  	ln -sf $(APP)-static build/$(APP)
> 
> +INCLUDES =-I../common
>  PC_FILE := $(shell $(PKGCONF) --path libdpdk 2>/dev/null)  CFLAGS += -O3
> $(shell $(PKGCONF) --cflags libdpdk)  # Added for 'rte_eth_link_to_str()'
> @@ -38,10 +39,10 @@ endif
>  endif
> 
>  build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
> -	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
> +	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS)
> +$(LDFLAGS_SHARED)
> 
>  build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
> -	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)
> +	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS)
> +$(LDFLAGS_STATIC)
> 
>  build:
>  	@mkdir -p $@
> diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h index
> 8a52c90755..40b5f32a9e 100644
> --- a/examples/l3fwd/l3fwd.h
> +++ b/examples/l3fwd/l3fwd.h
> @@ -44,8 +44,6 @@
>  /* Used to mark destination port as 'invalid'. */
>  #define	BAD_PORT ((uint16_t)-1)
> 
> -#define FWDSTEP	4
> -
>  /* replace first 12B of the ethernet header. */
>  #define	MASK_ETH 0x3f
> 
> diff --git a/examples/l3fwd/l3fwd_common.h
> b/examples/l3fwd/l3fwd_common.h index 8e4c27218f..224b1c08e8 100644
> --- a/examples/l3fwd/l3fwd_common.h
> +++ b/examples/l3fwd/l3fwd_common.h
> @@ -7,6 +7,8 @@
>  #ifndef _L3FWD_COMMON_H_
>  #define _L3FWD_COMMON_H_
> 
> +#include "pkt_group.h"
> +
>  #ifdef DO_RFC_1812_CHECKS
> 
>  #define	IPV4_MIN_VER_IHL	0x45
> @@ -50,133 +52,6 @@ rfc1812_process(struct rte_ipv4_hdr *ipv4_hdr, uint16_t
> *dp, uint32_t ptype)
>  #define	rfc1812_process(mb, dp, ptype)	do { } while (0)
>  #endif /* DO_RFC_1812_CHECKS */
> 
> -/*
> - * We group consecutive packets with the same destination port into one burst.
> - * To avoid extra latency this is done together with some other packet
> - * processing, but after we made a final decision about packet's destination.
> - * To do this we maintain:
> - * pnum - array of number of consecutive packets with the same dest port for
> - * each packet in the input burst.
> - * lp - pointer to the last updated element in the pnum.
> - * dlp - dest port value lp corresponds to.
> - */
> -
> -#define	GRPSZ	(1 << FWDSTEP)
> -#define	GRPMSK	(GRPSZ - 1)
> -
> -#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
> -	if (likely((dlp) == (dcp)[(idx)])) {             \
> -		(lp)[0]++;                                   \
> -	} else {                                         \
> -		(dlp) = (dcp)[idx];                          \
> -		(lp) = (pn) + (idx);                         \
> -		(lp)[0] = 1;                                 \
> -	}                                                \
> -} while (0)
> -
> -static const struct {
> -	uint64_t pnum; /* prebuild 4 values for pnum[]. */
> -	int32_t  idx;  /* index for new last updated element. */
> -	uint16_t lpv;  /* add value to the last updated element. */
> -} gptbl[GRPSZ] = {
> -	{
> -		/* 0: a != b, b != c, c != d, d != e */
> -		.pnum = UINT64_C(0x0001000100010001),
> -		.idx = 4,
> -		.lpv = 0,
> -	},
> -	{
> -		/* 1: a == b, b != c, c != d, d != e */
> -		.pnum = UINT64_C(0x0001000100010002),
> -		.idx = 4,
> -		.lpv = 1,
> -	},
> -	{
> -		/* 2: a != b, b == c, c != d, d != e */
> -		.pnum = UINT64_C(0x0001000100020001),
> -		.idx = 4,
> -		.lpv = 0,
> -	},
> -	{
> -		/* 3: a == b, b == c, c != d, d != e */
> -		.pnum = UINT64_C(0x0001000100020003),
> -		.idx = 4,
> -		.lpv = 2,
> -	},
> -	{
> -		/* 4: a != b, b != c, c == d, d != e */
> -		.pnum = UINT64_C(0x0001000200010001),
> -		.idx = 4,
> -		.lpv = 0,
> -	},
> -	{
> -		/* 5: a == b, b != c, c == d, d != e */
> -		.pnum = UINT64_C(0x0001000200010002),
> -		.idx = 4,
> -		.lpv = 1,
> -	},
> -	{
> -		/* 6: a != b, b == c, c == d, d != e */
> -		.pnum = UINT64_C(0x0001000200030001),
> -		.idx = 4,
> -		.lpv = 0,
> -	},
> -	{
> -		/* 7: a == b, b == c, c == d, d != e */
> -		.pnum = UINT64_C(0x0001000200030004),
> -		.idx = 4,
> -		.lpv = 3,
> -	},
> -	{
> -		/* 8: a != b, b != c, c != d, d == e */
> -		.pnum = UINT64_C(0x0002000100010001),
> -		.idx = 3,
> -		.lpv = 0,
> -	},
> -	{
> -		/* 9: a == b, b != c, c != d, d == e */
> -		.pnum = UINT64_C(0x0002000100010002),
> -		.idx = 3,
> -		.lpv = 1,
> -	},
> -	{
> -		/* 0xa: a != b, b == c, c != d, d == e */
> -		.pnum = UINT64_C(0x0002000100020001),
> -		.idx = 3,
> -		.lpv = 0,
> -	},
> -	{
> -		/* 0xb: a == b, b == c, c != d, d == e */
> -		.pnum = UINT64_C(0x0002000100020003),
> -		.idx = 3,
> -		.lpv = 2,
> -	},
> -	{
> -		/* 0xc: a != b, b != c, c == d, d == e */
> -		.pnum = UINT64_C(0x0002000300010001),
> -		.idx = 2,
> -		.lpv = 0,
> -	},
> -	{
> -		/* 0xd: a == b, b != c, c == d, d == e */
> -		.pnum = UINT64_C(0x0002000300010002),
> -		.idx = 2,
> -		.lpv = 1,
> -	},
> -	{
> -		/* 0xe: a != b, b == c, c == d, d == e */
> -		.pnum = UINT64_C(0x0002000300040001),
> -		.idx = 1,
> -		.lpv = 0,
> -	},
> -	{
> -		/* 0xf: a == b, b == c, c == d, d == e */
> -		.pnum = UINT64_C(0x0002000300040005),
> -		.idx = 0,
> -		.lpv = 4,
> -	},
> -};
> -
>  static __rte_always_inline void
>  send_packetsx4(struct lcore_conf *qconf, uint16_t port, struct rte_mbuf *m[],
>  		uint32_t num)
> diff --git a/examples/l3fwd/l3fwd_neon.h b/examples/l3fwd/l3fwd_neon.h
> index e3d33a5229..5fa765b640 100644
> --- a/examples/l3fwd/l3fwd_neon.h
> +++ b/examples/l3fwd/l3fwd_neon.h
> @@ -7,6 +7,7 @@
>  #define _L3FWD_NEON_H_
> 
>  #include "l3fwd.h"
> +#include "neon_common.h"
>  #include "l3fwd_common.h"
> 
>  /*
> @@ -62,44 +63,6 @@ processx4_step3(struct rte_mbuf *pkt[FWDSTEP],
> uint16_t dst_port[FWDSTEP])
>  			&dst_port[3], pkt[3]->packet_type);
>  }
> 
> -/*
> - * Group consecutive packets with the same destination port in bursts of 4.
> - * Suppose we have array of destination ports:
> - * dst_port[] = {a, b, c, d,, e, ... }
> - * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
> - * We doing 4 comparisons at once and the result is 4 bit mask.
> - * This mask is used as an index into prebuild array of pnum values.
> - */
> -static inline uint16_t *
> -port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
> -	     uint16x8_t dp2)
> -{
> -	union {
> -		uint16_t u16[FWDSTEP + 1];
> -		uint64_t u64;
> -	} *pnum = (void *)pn;
> -
> -	int32_t v;
> -	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
> -
> -	dp1 = vceqq_u16(dp1, dp2);
> -	dp1 = vandq_u16(dp1, mask);
> -	v = vaddvq_u16(dp1);
> -
> -	/* update last port counter. */
> -	lp[0] += gptbl[v].lpv;
> -	rte_compiler_barrier();
> -
> -	/* if dest port value has changed. */
> -	if (v != GRPMSK) {
> -		pnum->u64 = gptbl[v].pnum;
> -		pnum->u16[FWDSTEP] = 1;
> -		lp = pnum->u16 + gptbl[v].idx;
> -	}
> -
> -	return lp;
> -}
> -
>  /**
>   * Process one packet:
>   * Update source and destination MAC addresses in the ethernet header.
> @@ -161,7 +124,7 @@ send_packets_multi(struct lcore_conf *qconf, struct
> rte_mbuf **pkts_burst,
>  			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
>  			 */
>  			dp2 = vld1q_u16(&dst_port[j - FWDSTEP + 1]);
> -			lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
> +			lp  = neon_port_groupx4(&pnum[j - FWDSTEP], lp, dp1,
> dp2);
> 
>  			/*
>  			 * dp1:
> @@ -175,7 +138,7 @@ send_packets_multi(struct lcore_conf *qconf, struct
> rte_mbuf **pkts_burst,
>  		 */
>  		dp2 = vextq_u16(dp1, dp1, 1);
>  		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
> -		lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
> +		lp  = neon_port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
> 
>  		/*
>  		 * remove values added by the last repeated diff --git
> a/examples/meson.build b/examples/meson.build index
> 78de0e1f37..81e93799f2 100644
> --- a/examples/meson.build
> +++ b/examples/meson.build
> @@ -97,7 +97,7 @@ foreach example: examples
>      ldflags = default_ldflags
> 
>      ext_deps = []
> -    includes = [include_directories(example)]
> +    includes = [include_directories(example, 'common')]
>      deps = ['eal', 'mempool', 'net', 'mbuf', 'ethdev', 'cmdline']
>      subdir(example)
> 
> --
> 2.25.1
  
Akhil Goyal June 20, 2022, 7:49 a.m. UTC | #2
> This will make the packet grouping function common, so
> that other examples can utilize as per need.
> 
> Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
> ---

Adding more people for review.

@thomas@monjalon.net: Can this patch be taken in next-crypto as the patch is 
primarily for ipsec-secgw?

> Changes in v2: New patch to address review comment.
> 
>  examples/common/neon_common.h |  50 ++++++++++++
>  examples/common/pkt_group.h   | 139
> ++++++++++++++++++++++++++++++++++
>  examples/l3fwd/Makefile       |   5 +-
>  examples/l3fwd/l3fwd.h        |   2 -
>  examples/l3fwd/l3fwd_common.h | 129 +------------------------------
>  examples/l3fwd/l3fwd_neon.h   |  43 +----------
>  examples/meson.build          |   2 +-
>  7 files changed, 198 insertions(+), 172 deletions(-)
>  create mode 100644 examples/common/neon_common.h
>  create mode 100644 examples/common/pkt_group.h
  
Thomas Monjalon June 20, 2022, 10:45 a.m. UTC | #3
20/06/2022 09:49, Akhil Goyal:
> @thomas@monjalon.net: Can this patch be taken in next-crypto as the patch is 
> primarily for ipsec-secgw?

Yes that's fine.
  
Konstantin Ananyev June 20, 2022, 11:13 p.m. UTC | #4
17/06/2022 08:50, Rahul Bhansali пишет:
> CC: Konstantin Ananyev
> 
>> -----Original Message-----
>> From: Rahul Bhansali <rbhansali@marvell.com>
>> Sent: Friday, June 17, 2022 1:13 PM
>> To: dev@dpdk.org; Ruifeng Wang <ruifeng.wang@arm.com>
>> Cc: Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Rahul Bhansali
>> <rbhansali@marvell.com>
>> Subject: [PATCH v2 1/2] examples/l3fwd: common packet group functionality
>>
>> This will make the packet grouping function common, so that other examples
>> can utilize as per need.
>>
>> Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
>> ---
>> Changes in v2: New patch to address review comment.
>>
>>   examples/common/neon_common.h |  50 ++++++++++++
>>   examples/common/pkt_group.h   | 139
>> ++++++++++++++++++++++++++++++++++
>>   examples/l3fwd/Makefile       |   5 +-
>>   examples/l3fwd/l3fwd.h        |   2 -
>>   examples/l3fwd/l3fwd_common.h | 129 +------------------------------
>>   examples/l3fwd/l3fwd_neon.h   |  43 +----------
>>   examples/meson.build          |   2 +-
>>   7 files changed, 198 insertions(+), 172 deletions(-)  create mode 100644
>> examples/common/neon_common.h  create mode 100644
>> examples/common/pkt_group.h
>>
>> diff --git a/examples/common/neon_common.h
>> b/examples/common/neon_common.h new file mode 100644 index
>> 0000000000..f01b5ab6bc
>> --- /dev/null
>> +++ b/examples/common/neon_common.h
>> @@ -0,0 +1,50 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2016-2018 Intel Corporation.
>> + * Copyright(c) 2017-2018 Linaro Limited.
>> + * Copyright(C) 2022 Marvell.
>> + */
>> +
>> +#ifndef _NEON_COMMON_H_
>> +#define _NEON_COMMON_H_
>> +
>> +#include "pkt_group.h"
>> +
>> +/*
>> + * Group consecutive packets with the same destination port in bursts of 4.
>> + * Suppose we have array of destination ports:
>> + * dst_port[] = {a, b, c, d,, e, ... }
>> + * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
>> + * We doing 4 comparisons at once and the result is 4 bit mask.
>> + * This mask is used as an index into prebuild array of pnum values.
>> + */
>> +static inline uint16_t *
>> +neon_port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
>> +		  uint16x8_t dp2)
>> +{
>> +	union {
>> +		uint16_t u16[FWDSTEP + 1];
>> +		uint64_t u64;
>> +	} *pnum = (void *)pn;
>> +
>> +	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
>> +	int32_t v;
>> +
>> +	dp1 = vceqq_u16(dp1, dp2);
>> +	dp1 = vandq_u16(dp1, mask);
>> +	v = vaddvq_u16(dp1);
>> +
>> +	/* update last port counter. */hh
>> +	lp[0] += gptbl[v].lpv;
>> +	rte_compiler_barrier();
>> +
>> +	/* if dest port value has changed. */
>> +	if (v != GRPMSK) {
>> +		pnum->u64 = gptbl[v].pnum;
>> +		pnum->u16[FWDSTEP] = 1;
>> +		lp = pnum->u16 + gptbl[v].idx;
>> +	}
>> +
>> +	return lp;
>> +}

Thanks for the effort.
As I can see this function: port_groupx4() is nearly identical for all 3 
platforms: sse/nenon/altivec (except of course built-in arch-specific 
instincts).
In fact, even comemnts are identical.
I wonder can we have something like:
examples/common/<arch>/port_group.h
and for each arch will have defined port_groupx4(...)
?

>> +
>> +#endif /* _NEON_COMMON_H_ */
>> diff --git a/examples/common/pkt_group.h b/examples/common/pkt_group.h
>> new file mode 100644 index 0000000000..8b26d9380f
>> --- /dev/null
>> +++ b/examples/common/pkt_group.h
>> @@ -0,0 +1,139 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2016-2018 Intel Corporation.
>> + * Copyright(c) 2017-2018 Linaro Limited.
>> + * Copyright(C) 2022 Marvell.
>> + */
>> +
>> +#ifndef _PKT_GROUP_H_
>> +#define _PKT_GROUP_H_
>> +
>> +#define FWDSTEP	4
>> +
>> +/*
>> + * Group consecutive packets with the same destination port into one burst.
>> + * To avoid extra latency this is done together with some other packet
>> + * processing, but after we made a final decision about packet's destination.
>> + * To do this we maintain:
>> + * pnum - array of number of consecutive packets with the same dest
>> +port for
>> + * each packet in the input burst.
>> + * lp - pointer to the last updated element in the pnum.
>> + * dlp - dest port value lp corresponds to.
>> + */
>> +
>> +#define	GRPSZ	(1 << FWDSTEP)
>> +#define	GRPMSK	(GRPSZ - 1)
>> +
>> +#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
>> +	if (likely((dlp) == (dcp)[(idx)])) {         \
>> +		(lp)[0]++;                           \
>> +	} else {                                     \
>> +		(dlp) = (dcp)[idx];                  \
>> +		(lp) = (pn) + (idx);                 \
>> +		(lp)[0] = 1;                         \
>> +	}                                            \
>> +} while (0)
>> +
>> +static const struct {
>> +	uint64_t pnum; /* prebuild 4 values for pnum[]. */
>> +	int32_t  idx;  /* index for new last updated elemnet. */
>> +	uint16_t lpv;  /* add value to the last updated element. */ }
>> +gptbl[GRPSZ] = {
>> +	{
>> +		/* 0: a != b, b != c, c != d, d != e */
>> +		.pnum = UINT64_C(0x0001000100010001),
>> +		.idx = 4,
>> +		.lpv = 0,
>> +	},
>> +	{
>> +		/* 1: a == b, b != c, c != d, d != e */
>> +		.pnum = UINT64_C(0x0001000100010002),
>> +		.idx = 4,
>> +		.lpv = 1,
>> +	},
>> +	{
>> +		/* 2: a != b, b == c, c != d, d != e */
>> +		.pnum = UINT64_C(0x0001000100020001),
>> +		.idx = 4,
>> +		.lpv = 0,
>> +	},
>> +	{
>> +		/* 3: a == b, b == c, c != d, d != e */
>> +		.pnum = UINT64_C(0x0001000100020003),
>> +		.idx = 4,
>> +		.lpv = 2,
>> +	},
>> +	{
>> +		/* 4: a != b, b != c, c == d, d != e */
>> +		.pnum = UINT64_C(0x0001000200010001),
>> +		.idx = 4,
>> +		.lpv = 0,
>> +	},
>> +	{
>> +		/* 5: a == b, b != c, c == d, d != e */
>> +		.pnum = UINT64_C(0x0001000200010002),
>> +		.idx = 4,
>> +		.lpv = 1,
>> +	},
>> +	{
>> +		/* 6: a != b, b == c, c == d, d != e */
>> +		.pnum = UINT64_C(0x0001000200030001),
>> +		.idx = 4,
>> +		.lpv = 0,
>> +	},
>> +	{
>> +		/* 7: a == b, b == c, c == d, d != e */
>> +		.pnum = UINT64_C(0x0001000200030004),
>> +		.idx = 4,
>> +		.lpv = 3,
>> +	},
>> +	{
>> +		/* 8: a != b, b != c, c != d, d == e */
>> +		.pnum = UINT64_C(0x0002000100010001),
>> +		.idx = 3,
>> +		.lpv = 0,
>> +	},
>> +	{
>> +		/* 9: a == b, b != c, c != d, d == e */
>> +		.pnum = UINT64_C(0x0002000100010002),
>> +		.idx = 3,
>> +		.lpv = 1,
>> +	},
>> +	{
>> +		/* 0xa: a != b, b == c, c != d, d == e */
>> +		.pnum = UINT64_C(0x0002000100020001),
>> +		.idx = 3,
>> +		.lpv = 0,
>> +	},
>> +	{
>> +		/* 0xb: a == b, b == c, c != d, d == e */
>> +		.pnum = UINT64_C(0x0002000100020003),
>> +		.idx = 3,
>> +		.lpv = 2,
>> +	},
>> +	{
>> +		/* 0xc: a != b, b != c, c == d, d == e */
>> +		.pnum = UINT64_C(0x0002000300010001),
>> +		.idx = 2,
>> +		.lpv = 0,
>> +	},
>> +	{
>> +		/* 0xd: a == b, b != c, c == d, d == e */
>> +		.pnum = UINT64_C(0x0002000300010002),
>> +		.idx = 2,
>> +		.lpv = 1,
>> +	},
>> +	{
>> +		/* 0xe: a != b, b == c, c == d, d == e */
>> +		.pnum = UINT64_C(0x0002000300040001),
>> +		.idx = 1,
>> +		.lpv = 0,
>> +	},
>> +	{
>> +		/* 0xf: a == b, b == c, c == d, d == e */
>> +		.pnum = UINT64_C(0x0002000300040005),
>> +		.idx = 0,
>> +		.lpv = 4,
>> +	},
>> +};
>> +
>> +#endif /* _PKT_GROUP_H_ */
>> diff --git a/examples/l3fwd/Makefile b/examples/l3fwd/Makefile index
>> 8efe6378e2..8dbe85c2e6 100644
>> --- a/examples/l3fwd/Makefile
>> +++ b/examples/l3fwd/Makefile
>> @@ -22,6 +22,7 @@ shared: build/$(APP)-shared
>>   static: build/$(APP)-static
>>   	ln -sf $(APP)-static build/$(APP)
>>
>> +INCLUDES =-I../common
>>   PC_FILE := $(shell $(PKGCONF) --path libdpdk 2>/dev/null)  CFLAGS += -O3
>> $(shell $(PKGCONF) --cflags libdpdk)  # Added for 'rte_eth_link_to_str()'
>> @@ -38,10 +39,10 @@ endif
>>   endif
>>
>>   build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
>> -	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
>> +	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS)
>> +$(LDFLAGS_SHARED)
>>
>>   build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
>> -	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)
>> +	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS)
>> +$(LDFLAGS_STATIC)
>>
>>   build:
>>   	@mkdir -p $@
>> diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h index
>> 8a52c90755..40b5f32a9e 100644
>> --- a/examples/l3fwd/l3fwd.h
>> +++ b/examples/l3fwd/l3fwd.h
>> @@ -44,8 +44,6 @@
>>   /* Used to mark destination port as 'invalid'. */
>>   #define	BAD_PORT ((uint16_t)-1)
>>
>> -#define FWDSTEP	4
>> -
>>   /* replace first 12B of the ethernet header. */
>>   #define	MASK_ETH 0x3f
>>
>> diff --git a/examples/l3fwd/l3fwd_common.h
>> b/examples/l3fwd/l3fwd_common.h index 8e4c27218f..224b1c08e8 100644
>> --- a/examples/l3fwd/l3fwd_common.h
>> +++ b/examples/l3fwd/l3fwd_common.h
>> @@ -7,6 +7,8 @@
>>   #ifndef _L3FWD_COMMON_H_
>>   #define _L3FWD_COMMON_H_
>>
>> +#include "pkt_group.h"
>> +
>>   #ifdef DO_RFC_1812_CHECKS
>>
>>   #define	IPV4_MIN_VER_IHL	0x45
>> @@ -50,133 +52,6 @@ rfc1812_process(struct rte_ipv4_hdr *ipv4_hdr, uint16_t
>> *dp, uint32_t ptype)
>>   #define	rfc1812_process(mb, dp, ptype)	do { } while (0)
>>   #endif /* DO_RFC_1812_CHECKS */
>>
>> -/*
>> - * We group consecutive packets with the same destination port into one burst.
>> - * To avoid extra latency this is done together with some other packet
>> - * processing, but after we made a final decision about packet's destination.
>> - * To do this we maintain:
>> - * pnum - array of number of consecutive packets with the same dest port for
>> - * each packet in the input burst.
>> - * lp - pointer to the last updated element in the pnum.
>> - * dlp - dest port value lp corresponds to.
>> - */
>> -
>> -#define	GRPSZ	(1 << FWDSTEP)
>> -#define	GRPMSK	(GRPSZ - 1)
>> -
>> -#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
>> -	if (likely((dlp) == (dcp)[(idx)])) {             \
>> -		(lp)[0]++;                                   \
>> -	} else {                                         \
>> -		(dlp) = (dcp)[idx];                          \
>> -		(lp) = (pn) + (idx);                         \
>> -		(lp)[0] = 1;                                 \
>> -	}                                                \
>> -} while (0)
>> -
>> -static const struct {
>> -	uint64_t pnum; /* prebuild 4 values for pnum[]. */
>> -	int32_t  idx;  /* index for new last updated element. */
>> -	uint16_t lpv;  /* add value to the last updated element. */
>> -} gptbl[GRPSZ] = {
>> -	{
>> -		/* 0: a != b, b != c, c != d, d != e */
>> -		.pnum = UINT64_C(0x0001000100010001),
>> -		.idx = 4,
>> -		.lpv = 0,
>> -	},
>> -	{
>> -		/* 1: a == b, b != c, c != d, d != e */
>> -		.pnum = UINT64_C(0x0001000100010002),
>> -		.idx = 4,
>> -		.lpv = 1,
>> -	},
>> -	{
>> -		/* 2: a != b, b == c, c != d, d != e */
>> -		.pnum = UINT64_C(0x0001000100020001),
>> -		.idx = 4,
>> -		.lpv = 0,
>> -	},
>> -	{
>> -		/* 3: a == b, b == c, c != d, d != e */
>> -		.pnum = UINT64_C(0x0001000100020003),
>> -		.idx = 4,
>> -		.lpv = 2,
>> -	},
>> -	{
>> -		/* 4: a != b, b != c, c == d, d != e */
>> -		.pnum = UINT64_C(0x0001000200010001),
>> -		.idx = 4,
>> -		.lpv = 0,
>> -	},
>> -	{
>> -		/* 5: a == b, b != c, c == d, d != e */
>> -		.pnum = UINT64_C(0x0001000200010002),
>> -		.idx = 4,
>> -		.lpv = 1,
>> -	},
>> -	{
>> -		/* 6: a != b, b == c, c == d, d != e */
>> -		.pnum = UINT64_C(0x0001000200030001),
>> -		.idx = 4,
>> -		.lpv = 0,
>> -	},
>> -	{
>> -		/* 7: a == b, b == c, c == d, d != e */
>> -		.pnum = UINT64_C(0x0001000200030004),
>> -		.idx = 4,
>> -		.lpv = 3,
>> -	},
>> -	{
>> -		/* 8: a != b, b != c, c != d, d == e */
>> -		.pnum = UINT64_C(0x0002000100010001),
>> -		.idx = 3,
>> -		.lpv = 0,
>> -	},
>> -	{
>> -		/* 9: a == b, b != c, c != d, d == e */
>> -		.pnum = UINT64_C(0x0002000100010002),
>> -		.idx = 3,
>> -		.lpv = 1,
>> -	},
>> -	{
>> -		/* 0xa: a != b, b == c, c != d, d == e */
>> -		.pnum = UINT64_C(0x0002000100020001),
>> -		.idx = 3,
>> -		.lpv = 0,
>> -	},
>> -	{
>> -		/* 0xb: a == b, b == c, c != d, d == e */
>> -		.pnum = UINT64_C(0x0002000100020003),
>> -		.idx = 3,
>> -		.lpv = 2,
>> -	},
>> -	{
>> -		/* 0xc: a != b, b != c, c == d, d == e */
>> -		.pnum = UINT64_C(0x0002000300010001),
>> -		.idx = 2,
>> -		.lpv = 0,
>> -	},
>> -	{
>> -		/* 0xd: a == b, b != c, c == d, d == e */
>> -		.pnum = UINT64_C(0x0002000300010002),
>> -		.idx = 2,
>> -		.lpv = 1,
>> -	},
>> -	{
>> -		/* 0xe: a != b, b == c, c == d, d == e */
>> -		.pnum = UINT64_C(0x0002000300040001),
>> -		.idx = 1,
>> -		.lpv = 0,
>> -	},
>> -	{
>> -		/* 0xf: a == b, b == c, c == d, d == e */
>> -		.pnum = UINT64_C(0x0002000300040005),
>> -		.idx = 0,
>> -		.lpv = 4,
>> -	},
>> -};
>> -
>>   static __rte_always_inline void
>>   send_packetsx4(struct lcore_conf *qconf, uint16_t port, struct rte_mbuf *m[],
>>   		uint32_t num)
>> diff --git a/examples/l3fwd/l3fwd_neon.h b/examples/l3fwd/l3fwd_neon.h
>> index e3d33a5229..5fa765b640 100644
>> --- a/examples/l3fwd/l3fwd_neon.h
>> +++ b/examples/l3fwd/l3fwd_neon.h
>> @@ -7,6 +7,7 @@
>>   #define _L3FWD_NEON_H_
>>
>>   #include "l3fwd.h"
>> +#include "neon_common.h"
>>   #include "l3fwd_common.h"
>>
>>   /*
>> @@ -62,44 +63,6 @@ processx4_step3(struct rte_mbuf *pkt[FWDSTEP],
>> uint16_t dst_port[FWDSTEP])
>>   			&dst_port[3], pkt[3]->packet_type);
>>   }
>>
>> -/*
>> - * Group consecutive packets with the same destination port in bursts of 4.
>> - * Suppose we have array of destination ports:
>> - * dst_port[] = {a, b, c, d,, e, ... }
>> - * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
>> - * We doing 4 comparisons at once and the result is 4 bit mask.
>> - * This mask is used as an index into prebuild array of pnum values.
>> - */
>> -static inline uint16_t *
>> -port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
>> -	     uint16x8_t dp2)
>> -{
>> -	union {
>> -		uint16_t u16[FWDSTEP + 1];
>> -		uint64_t u64;
>> -	} *pnum = (void *)pn;
>> -
>> -	int32_t v;
>> -	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
>> -
>> -	dp1 = vceqq_u16(dp1, dp2);
>> -	dp1 = vandq_u16(dp1, mask);
>> -	v = vaddvq_u16(dp1);
>> -
>> -	/* update last port counter. */
>> -	lp[0] += gptbl[v].lpv;
>> -	rte_compiler_barrier();
>> -
>> -	/* if dest port value has changed. */
>> -	if (v != GRPMSK) {
>> -		pnum->u64 = gptbl[v].pnum;
>> -		pnum->u16[FWDSTEP] = 1;
>> -		lp = pnum->u16 + gptbl[v].idx;
>> -	}
>> -
>> -	return lp;
>> -}
>> -
>>   /**
>>    * Process one packet:
>>    * Update source and destination MAC addresses in the ethernet header.
>> @@ -161,7 +124,7 @@ send_packets_multi(struct lcore_conf *qconf, struct
>> rte_mbuf **pkts_burst,
>>   			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
>>   			 */
>>   			dp2 = vld1q_u16(&dst_port[j - FWDSTEP + 1]);
>> -			lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
>> +			lp  = neon_port_groupx4(&pnum[j - FWDSTEP], lp, dp1,
>> dp2);
>>
>>   			/*
>>   			 * dp1:
>> @@ -175,7 +138,7 @@ send_packets_multi(struct lcore_conf *qconf, struct
>> rte_mbuf **pkts_burst,
>>   		 */
>>   		dp2 = vextq_u16(dp1, dp1, 1);
>>   		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
>> -		lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
>> +		lp  = neon_port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
>>
>>   		/*
>>   		 * remove values added by the last repeated diff --git
>> a/examples/meson.build b/examples/meson.build index
>> 78de0e1f37..81e93799f2 100644
>> --- a/examples/meson.build
>> +++ b/examples/meson.build
>> @@ -97,7 +97,7 @@ foreach example: examples
>>       ldflags = default_ldflags
>>
>>       ext_deps = []
>> -    includes = [include_directories(example)]
>> +    includes = [include_directories(example, 'common')]
>>       deps = ['eal', 'mempool', 'net', 'mbuf', 'ethdev', 'cmdline']
>>       subdir(example)
>>
>> --
>> 2.25.1
>
  
Akhil Goyal June 21, 2022, 12:56 p.m. UTC | #5
> 
> > This will make the packet grouping function common, so
> > that other examples can utilize as per need.
> >
> > Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
> > ---
Acked-by: Akhil Goyal <gakhil@marvell.com>
  
Rahul Bhansali June 21, 2022, 4:50 p.m. UTC | #6
> -----Original Message-----
> From: Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>
> Sent: Tuesday, June 21, 2022 4:43 AM
> To: Rahul Bhansali <rbhansali@marvell.com>; dev@dpdk.org; Ruifeng Wang
> <ruifeng.wang@arm.com>
> Cc: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
> Subject: [EXT] Re: [PATCH v2 1/2] examples/l3fwd: common packet group
> functionality
> 
> External Email
> 
> ----------------------------------------------------------------------
> 17/06/2022 08:50, Rahul Bhansali пишет:
> > CC: Konstantin Ananyev
> >
> >> -----Original Message-----
> >> From: Rahul Bhansali <rbhansali@marvell.com>
> >> Sent: Friday, June 17, 2022 1:13 PM
> >> To: dev@dpdk.org; Ruifeng Wang <ruifeng.wang@arm.com>
> >> Cc: Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Rahul Bhansali
> >> <rbhansali@marvell.com>
> >> Subject: [PATCH v2 1/2] examples/l3fwd: common packet group
> >> functionality
> >>
> >> This will make the packet grouping function common, so that other
> >> examples can utilize as per need.
> >>
> >> Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
> >> ---
> >> Changes in v2: New patch to address review comment.
> >>
> >>   examples/common/neon_common.h |  50 ++++++++++++
> >>   examples/common/pkt_group.h   | 139
> >> ++++++++++++++++++++++++++++++++++
> >>   examples/l3fwd/Makefile       |   5 +-
> >>   examples/l3fwd/l3fwd.h        |   2 -
> >>   examples/l3fwd/l3fwd_common.h | 129 +------------------------------
> >>   examples/l3fwd/l3fwd_neon.h   |  43 +----------
> >>   examples/meson.build          |   2 +-
> >>   7 files changed, 198 insertions(+), 172 deletions(-)  create mode
> >> 100644 examples/common/neon_common.h  create mode 100644
> >> examples/common/pkt_group.h
> >>
> >> diff --git a/examples/common/neon_common.h
> >> b/examples/common/neon_common.h new file mode 100644 index
> >> 0000000000..f01b5ab6bc
> >> --- /dev/null
> >> +++ b/examples/common/neon_common.h
> >> @@ -0,0 +1,50 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(c) 2016-2018 Intel Corporation.
> >> + * Copyright(c) 2017-2018 Linaro Limited.
> >> + * Copyright(C) 2022 Marvell.
> >> + */
> >> +
> >> +#ifndef _NEON_COMMON_H_
> >> +#define _NEON_COMMON_H_
> >> +
> >> +#include "pkt_group.h"
> >> +
> >> +/*
> >> + * Group consecutive packets with the same destination port in bursts of 4.
> >> + * Suppose we have array of destination ports:
> >> + * dst_port[] = {a, b, c, d,, e, ... }
> >> + * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
> >> + * We doing 4 comparisons at once and the result is 4 bit mask.
> >> + * This mask is used as an index into prebuild array of pnum values.
> >> + */
> >> +static inline uint16_t *
> >> +neon_port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t
> dp1,
> >> +		  uint16x8_t dp2)
> >> +{
> >> +	union {
> >> +		uint16_t u16[FWDSTEP + 1];
> >> +		uint64_t u64;
> >> +	} *pnum = (void *)pn;
> >> +
> >> +	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
> >> +	int32_t v;
> >> +
> >> +	dp1 = vceqq_u16(dp1, dp2);
> >> +	dp1 = vandq_u16(dp1, mask);
> >> +	v = vaddvq_u16(dp1);
> >> +
> >> +	/* update last port counter. */hh
> >> +	lp[0] += gptbl[v].lpv;
> >> +	rte_compiler_barrier();
> >> +
> >> +	/* if dest port value has changed. */
> >> +	if (v != GRPMSK) {
> >> +		pnum->u64 = gptbl[v].pnum;
> >> +		pnum->u16[FWDSTEP] = 1;
> >> +		lp = pnum->u16 + gptbl[v].idx;
> >> +	}
> >> +
> >> +	return lp;
> >> +}
> 
> Thanks for the effort.
> As I can see this function: port_groupx4() is nearly identical for all 3
> platforms: sse/nenon/altivec (except of course built-in arch-specific instincts).
> In fact, even comemnts are identical.
> I wonder can we have something like:
> examples/common/<arch>/port_group.h
> and for each arch will have defined port_groupx4(...) ?
> 
Yes, It’s a good point. I was thinking to have arch in file name itself. But we can have arch specific directory and have different header files.
Do you want me to make changes for all 3 sse/neon/altivec or just neon ?
I can check compilation for all but functionality/perf validate for Neon only.

> >> +
> >> +#endif /* _NEON_COMMON_H_ */
> >> diff --git a/examples/common/pkt_group.h
> >> b/examples/common/pkt_group.h new file mode 100644 index
> >> 0000000000..8b26d9380f
> >> --- /dev/null
> >> +++ b/examples/common/pkt_group.h
> >> @@ -0,0 +1,139 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(c) 2016-2018 Intel Corporation.
> >> + * Copyright(c) 2017-2018 Linaro Limited.
> >> + * Copyright(C) 2022 Marvell.
> >> + */
> >> +
> >> +#ifndef _PKT_GROUP_H_
> >> +#define _PKT_GROUP_H_
> >> +
> >> +#define FWDSTEP	4
> >> +
> >> +/*
> >> + * Group consecutive packets with the same destination port into one burst.
> >> + * To avoid extra latency this is done together with some other
> >> +packet
> >> + * processing, but after we made a final decision about packet's destination.
> >> + * To do this we maintain:
> >> + * pnum - array of number of consecutive packets with the same dest
> >> +port for
> >> + * each packet in the input burst.
> >> + * lp - pointer to the last updated element in the pnum.
> >> + * dlp - dest port value lp corresponds to.
> >> + */
> >> +
> >> +#define	GRPSZ	(1 << FWDSTEP)
> >> +#define	GRPMSK	(GRPSZ - 1)
> >> +
> >> +#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
> >> +	if (likely((dlp) == (dcp)[(idx)])) {         \
> >> +		(lp)[0]++;                           \
> >> +	} else {                                     \
> >> +		(dlp) = (dcp)[idx];                  \
> >> +		(lp) = (pn) + (idx);                 \
> >> +		(lp)[0] = 1;                         \
> >> +	}                                            \
> >> +} while (0)
> >> +
> >> +static const struct {
> >> +	uint64_t pnum; /* prebuild 4 values for pnum[]. */
> >> +	int32_t  idx;  /* index for new last updated elemnet. */
> >> +	uint16_t lpv;  /* add value to the last updated element. */ }
> >> +gptbl[GRPSZ] = {
> >> +	{
> >> +		/* 0: a != b, b != c, c != d, d != e */
> >> +		.pnum = UINT64_C(0x0001000100010001),
> >> +		.idx = 4,
> >> +		.lpv = 0,
> >> +	},
> >> +	{
> >> +		/* 1: a == b, b != c, c != d, d != e */
> >> +		.pnum = UINT64_C(0x0001000100010002),
> >> +		.idx = 4,
> >> +		.lpv = 1,
> >> +	},
> >> +	{
> >> +		/* 2: a != b, b == c, c != d, d != e */
> >> +		.pnum = UINT64_C(0x0001000100020001),
> >> +		.idx = 4,
> >> +		.lpv = 0,
> >> +	},
> >> +	{
> >> +		/* 3: a == b, b == c, c != d, d != e */
> >> +		.pnum = UINT64_C(0x0001000100020003),
> >> +		.idx = 4,
> >> +		.lpv = 2,
> >> +	},
> >> +	{
> >> +		/* 4: a != b, b != c, c == d, d != e */
> >> +		.pnum = UINT64_C(0x0001000200010001),
> >> +		.idx = 4,
> >> +		.lpv = 0,
> >> +	},
> >> +	{
> >> +		/* 5: a == b, b != c, c == d, d != e */
> >> +		.pnum = UINT64_C(0x0001000200010002),
> >> +		.idx = 4,
> >> +		.lpv = 1,
> >> +	},
> >> +	{
> >> +		/* 6: a != b, b == c, c == d, d != e */
> >> +		.pnum = UINT64_C(0x0001000200030001),
> >> +		.idx = 4,
> >> +		.lpv = 0,
> >> +	},
> >> +	{
> >> +		/* 7: a == b, b == c, c == d, d != e */
> >> +		.pnum = UINT64_C(0x0001000200030004),
> >> +		.idx = 4,
> >> +		.lpv = 3,
> >> +	},
> >> +	{
> >> +		/* 8: a != b, b != c, c != d, d == e */
> >> +		.pnum = UINT64_C(0x0002000100010001),
> >> +		.idx = 3,
> >> +		.lpv = 0,
> >> +	},
> >> +	{
> >> +		/* 9: a == b, b != c, c != d, d == e */
> >> +		.pnum = UINT64_C(0x0002000100010002),
> >> +		.idx = 3,
> >> +		.lpv = 1,
> >> +	},
> >> +	{
> >> +		/* 0xa: a != b, b == c, c != d, d == e */
> >> +		.pnum = UINT64_C(0x0002000100020001),
> >> +		.idx = 3,
> >> +		.lpv = 0,
> >> +	},
> >> +	{
> >> +		/* 0xb: a == b, b == c, c != d, d == e */
> >> +		.pnum = UINT64_C(0x0002000100020003),
> >> +		.idx = 3,
> >> +		.lpv = 2,
> >> +	},
> >> +	{
> >> +		/* 0xc: a != b, b != c, c == d, d == e */
> >> +		.pnum = UINT64_C(0x0002000300010001),
> >> +		.idx = 2,
> >> +		.lpv = 0,
> >> +	},
> >> +	{
> >> +		/* 0xd: a == b, b != c, c == d, d == e */
> >> +		.pnum = UINT64_C(0x0002000300010002),
> >> +		.idx = 2,
> >> +		.lpv = 1,
> >> +	},
> >> +	{
> >> +		/* 0xe: a != b, b == c, c == d, d == e */
> >> +		.pnum = UINT64_C(0x0002000300040001),
> >> +		.idx = 1,
> >> +		.lpv = 0,
> >> +	},
> >> +	{
> >> +		/* 0xf: a == b, b == c, c == d, d == e */
> >> +		.pnum = UINT64_C(0x0002000300040005),
> >> +		.idx = 0,
> >> +		.lpv = 4,
> >> +	},
> >> +};
> >> +
> >> +#endif /* _PKT_GROUP_H_ */
> >> diff --git a/examples/l3fwd/Makefile b/examples/l3fwd/Makefile index
> >> 8efe6378e2..8dbe85c2e6 100644
> >> --- a/examples/l3fwd/Makefile
> >> +++ b/examples/l3fwd/Makefile
> >> @@ -22,6 +22,7 @@ shared: build/$(APP)-shared
> >>   static: build/$(APP)-static
> >>   	ln -sf $(APP)-static build/$(APP)
> >>
> >> +INCLUDES =-I../common
> >>   PC_FILE := $(shell $(PKGCONF) --path libdpdk 2>/dev/null)  CFLAGS
> >> += -O3 $(shell $(PKGCONF) --cflags libdpdk)  # Added for
> 'rte_eth_link_to_str()'
> >> @@ -38,10 +39,10 @@ endif
> >>   endif
> >>
> >>   build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
> >> -	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
> >> +	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS)
> >> +$(LDFLAGS_SHARED)
> >>
> >>   build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
> >> -	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)
> >> +	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS)
> >> +$(LDFLAGS_STATIC)
> >>
> >>   build:
> >>   	@mkdir -p $@
> >> diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h index
> >> 8a52c90755..40b5f32a9e 100644
> >> --- a/examples/l3fwd/l3fwd.h
> >> +++ b/examples/l3fwd/l3fwd.h
> >> @@ -44,8 +44,6 @@
> >>   /* Used to mark destination port as 'invalid'. */
> >>   #define	BAD_PORT ((uint16_t)-1)
> >>
> >> -#define FWDSTEP	4
> >> -
> >>   /* replace first 12B of the ethernet header. */
> >>   #define	MASK_ETH 0x3f
> >>
> >> diff --git a/examples/l3fwd/l3fwd_common.h
> >> b/examples/l3fwd/l3fwd_common.h index 8e4c27218f..224b1c08e8 100644
> >> --- a/examples/l3fwd/l3fwd_common.h
> >> +++ b/examples/l3fwd/l3fwd_common.h
> >> @@ -7,6 +7,8 @@
> >>   #ifndef _L3FWD_COMMON_H_
> >>   #define _L3FWD_COMMON_H_
> >>
> >> +#include "pkt_group.h"
> >> +
> >>   #ifdef DO_RFC_1812_CHECKS
> >>
> >>   #define	IPV4_MIN_VER_IHL	0x45
> >> @@ -50,133 +52,6 @@ rfc1812_process(struct rte_ipv4_hdr *ipv4_hdr,
> >> uint16_t *dp, uint32_t ptype)
> >>   #define	rfc1812_process(mb, dp, ptype)	do { } while (0)
> >>   #endif /* DO_RFC_1812_CHECKS */
> >>
> >> -/*
> >> - * We group consecutive packets with the same destination port into one
> burst.
> >> - * To avoid extra latency this is done together with some other
> >> packet
> >> - * processing, but after we made a final decision about packet's destination.
> >> - * To do this we maintain:
> >> - * pnum - array of number of consecutive packets with the same dest
> >> port for
> >> - * each packet in the input burst.
> >> - * lp - pointer to the last updated element in the pnum.
> >> - * dlp - dest port value lp corresponds to.
> >> - */
> >> -
> >> -#define	GRPSZ	(1 << FWDSTEP)
> >> -#define	GRPMSK	(GRPSZ - 1)
> >> -
> >> -#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
> >> -	if (likely((dlp) == (dcp)[(idx)])) {             \
> >> -		(lp)[0]++;                                   \
> >> -	} else {                                         \
> >> -		(dlp) = (dcp)[idx];                          \
> >> -		(lp) = (pn) + (idx);                         \
> >> -		(lp)[0] = 1;                                 \
> >> -	}                                                \
> >> -} while (0)
> >> -
> >> -static const struct {
> >> -	uint64_t pnum; /* prebuild 4 values for pnum[]. */
> >> -	int32_t  idx;  /* index for new last updated element. */
> >> -	uint16_t lpv;  /* add value to the last updated element. */
> >> -} gptbl[GRPSZ] = {
> >> -	{
> >> -		/* 0: a != b, b != c, c != d, d != e */
> >> -		.pnum = UINT64_C(0x0001000100010001),
> >> -		.idx = 4,
> >> -		.lpv = 0,
> >> -	},
> >> -	{
> >> -		/* 1: a == b, b != c, c != d, d != e */
> >> -		.pnum = UINT64_C(0x0001000100010002),
> >> -		.idx = 4,
> >> -		.lpv = 1,
> >> -	},
> >> -	{
> >> -		/* 2: a != b, b == c, c != d, d != e */
> >> -		.pnum = UINT64_C(0x0001000100020001),
> >> -		.idx = 4,
> >> -		.lpv = 0,
> >> -	},
> >> -	{
> >> -		/* 3: a == b, b == c, c != d, d != e */
> >> -		.pnum = UINT64_C(0x0001000100020003),
> >> -		.idx = 4,
> >> -		.lpv = 2,
> >> -	},
> >> -	{
> >> -		/* 4: a != b, b != c, c == d, d != e */
> >> -		.pnum = UINT64_C(0x0001000200010001),
> >> -		.idx = 4,
> >> -		.lpv = 0,
> >> -	},
> >> -	{
> >> -		/* 5: a == b, b != c, c == d, d != e */
> >> -		.pnum = UINT64_C(0x0001000200010002),
> >> -		.idx = 4,
> >> -		.lpv = 1,
> >> -	},
> >> -	{
> >> -		/* 6: a != b, b == c, c == d, d != e */
> >> -		.pnum = UINT64_C(0x0001000200030001),
> >> -		.idx = 4,
> >> -		.lpv = 0,
> >> -	},
> >> -	{
> >> -		/* 7: a == b, b == c, c == d, d != e */
> >> -		.pnum = UINT64_C(0x0001000200030004),
> >> -		.idx = 4,
> >> -		.lpv = 3,
> >> -	},
> >> -	{
> >> -		/* 8: a != b, b != c, c != d, d == e */
> >> -		.pnum = UINT64_C(0x0002000100010001),
> >> -		.idx = 3,
> >> -		.lpv = 0,
> >> -	},
> >> -	{
> >> -		/* 9: a == b, b != c, c != d, d == e */
> >> -		.pnum = UINT64_C(0x0002000100010002),
> >> -		.idx = 3,
> >> -		.lpv = 1,
> >> -	},
> >> -	{
> >> -		/* 0xa: a != b, b == c, c != d, d == e */
> >> -		.pnum = UINT64_C(0x0002000100020001),
> >> -		.idx = 3,
> >> -		.lpv = 0,
> >> -	},
> >> -	{
> >> -		/* 0xb: a == b, b == c, c != d, d == e */
> >> -		.pnum = UINT64_C(0x0002000100020003),
> >> -		.idx = 3,
> >> -		.lpv = 2,
> >> -	},
> >> -	{
> >> -		/* 0xc: a != b, b != c, c == d, d == e */
> >> -		.pnum = UINT64_C(0x0002000300010001),
> >> -		.idx = 2,
> >> -		.lpv = 0,
> >> -	},
> >> -	{
> >> -		/* 0xd: a == b, b != c, c == d, d == e */
> >> -		.pnum = UINT64_C(0x0002000300010002),
> >> -		.idx = 2,
> >> -		.lpv = 1,
> >> -	},
> >> -	{
> >> -		/* 0xe: a != b, b == c, c == d, d == e */
> >> -		.pnum = UINT64_C(0x0002000300040001),
> >> -		.idx = 1,
> >> -		.lpv = 0,
> >> -	},
> >> -	{
> >> -		/* 0xf: a == b, b == c, c == d, d == e */
> >> -		.pnum = UINT64_C(0x0002000300040005),
> >> -		.idx = 0,
> >> -		.lpv = 4,
> >> -	},
> >> -};
> >> -
> >>   static __rte_always_inline void
> >>   send_packetsx4(struct lcore_conf *qconf, uint16_t port, struct rte_mbuf
> *m[],
> >>   		uint32_t num)
> >> diff --git a/examples/l3fwd/l3fwd_neon.h
> >> b/examples/l3fwd/l3fwd_neon.h index e3d33a5229..5fa765b640 100644
> >> --- a/examples/l3fwd/l3fwd_neon.h
> >> +++ b/examples/l3fwd/l3fwd_neon.h
> >> @@ -7,6 +7,7 @@
> >>   #define _L3FWD_NEON_H_
> >>
> >>   #include "l3fwd.h"
> >> +#include "neon_common.h"
> >>   #include "l3fwd_common.h"
> >>
> >>   /*
> >> @@ -62,44 +63,6 @@ processx4_step3(struct rte_mbuf *pkt[FWDSTEP],
> >> uint16_t dst_port[FWDSTEP])
> >>   			&dst_port[3], pkt[3]->packet_type);
> >>   }
> >>
> >> -/*
> >> - * Group consecutive packets with the same destination port in bursts of 4.
> >> - * Suppose we have array of destination ports:
> >> - * dst_port[] = {a, b, c, d,, e, ... }
> >> - * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
> >> - * We doing 4 comparisons at once and the result is 4 bit mask.
> >> - * This mask is used as an index into prebuild array of pnum values.
> >> - */
> >> -static inline uint16_t *
> >> -port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
> >> -	     uint16x8_t dp2)
> >> -{
> >> -	union {
> >> -		uint16_t u16[FWDSTEP + 1];
> >> -		uint64_t u64;
> >> -	} *pnum = (void *)pn;
> >> -
> >> -	int32_t v;
> >> -	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
> >> -
> >> -	dp1 = vceqq_u16(dp1, dp2);
> >> -	dp1 = vandq_u16(dp1, mask);
> >> -	v = vaddvq_u16(dp1);
> >> -
> >> -	/* update last port counter. */
> >> -	lp[0] += gptbl[v].lpv;
> >> -	rte_compiler_barrier();
> >> -
> >> -	/* if dest port value has changed. */
> >> -	if (v != GRPMSK) {
> >> -		pnum->u64 = gptbl[v].pnum;
> >> -		pnum->u16[FWDSTEP] = 1;
> >> -		lp = pnum->u16 + gptbl[v].idx;
> >> -	}
> >> -
> >> -	return lp;
> >> -}
> >> -
> >>   /**
> >>    * Process one packet:
> >>    * Update source and destination MAC addresses in the ethernet header.
> >> @@ -161,7 +124,7 @@ send_packets_multi(struct lcore_conf *qconf,
> >> struct rte_mbuf **pkts_burst,
> >>   			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
> >>   			 */
> >>   			dp2 = vld1q_u16(&dst_port[j - FWDSTEP + 1]);
> >> -			lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
> >> +			lp  = neon_port_groupx4(&pnum[j - FWDSTEP], lp, dp1,
> >> dp2);
> >>
> >>   			/*
> >>   			 * dp1:
> >> @@ -175,7 +138,7 @@ send_packets_multi(struct lcore_conf *qconf,
> >> struct rte_mbuf **pkts_burst,
> >>   		 */
> >>   		dp2 = vextq_u16(dp1, dp1, 1);
> >>   		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
> >> -		lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
> >> +		lp  = neon_port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
> >>
> >>   		/*
> >>   		 * remove values added by the last repeated diff --git
> >> a/examples/meson.build b/examples/meson.build index
> >> 78de0e1f37..81e93799f2 100644
> >> --- a/examples/meson.build
> >> +++ b/examples/meson.build
> >> @@ -97,7 +97,7 @@ foreach example: examples
> >>       ldflags = default_ldflags
> >>
> >>       ext_deps = []
> >> -    includes = [include_directories(example)]
> >> +    includes = [include_directories(example, 'common')]
> >>       deps = ['eal', 'mempool', 'net', 'mbuf', 'ethdev', 'cmdline']
> >>       subdir(example)
> >>
> >> --
> >> 2.25.1
> >
  
Konstantin Ananyev June 22, 2022, 11:25 p.m. UTC | #7
21/06/2022 17:50, Rahul Bhansali пишет:
> 
> 
>> -----Original Message-----
>> From: Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>
>> Sent: Tuesday, June 21, 2022 4:43 AM
>> To: Rahul Bhansali <rbhansali@marvell.com>; dev@dpdk.org; Ruifeng Wang
>> <ruifeng.wang@arm.com>
>> Cc: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
>> Subject: [EXT] Re: [PATCH v2 1/2] examples/l3fwd: common packet group
>> functionality
>>
>> External Email
>>
>> ----------------------------------------------------------------------
>> 17/06/2022 08:50, Rahul Bhansali пишет:
>>> CC: Konstantin Ananyev
>>>
>>>> -----Original Message-----
>>>> From: Rahul Bhansali <rbhansali@marvell.com>
>>>> Sent: Friday, June 17, 2022 1:13 PM
>>>> To: dev@dpdk.org; Ruifeng Wang <ruifeng.wang@arm.com>
>>>> Cc: Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Rahul Bhansali
>>>> <rbhansali@marvell.com>
>>>> Subject: [PATCH v2 1/2] examples/l3fwd: common packet group
>>>> functionality
>>>>
>>>> This will make the packet grouping function common, so that other
>>>> examples can utilize as per need.
>>>>
>>>> Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
>>>> ---
>>>> Changes in v2: New patch to address review comment.
>>>>
>>>>    examples/common/neon_common.h |  50 ++++++++++++
>>>>    examples/common/pkt_group.h   | 139
>>>> ++++++++++++++++++++++++++++++++++
>>>>    examples/l3fwd/Makefile       |   5 +-
>>>>    examples/l3fwd/l3fwd.h        |   2 -
>>>>    examples/l3fwd/l3fwd_common.h | 129 +------------------------------
>>>>    examples/l3fwd/l3fwd_neon.h   |  43 +----------
>>>>    examples/meson.build          |   2 +-
>>>>    7 files changed, 198 insertions(+), 172 deletions(-)  create mode
>>>> 100644 examples/common/neon_common.h  create mode 100644
>>>> examples/common/pkt_group.h
>>>>
>>>> diff --git a/examples/common/neon_common.h
>>>> b/examples/common/neon_common.h new file mode 100644 index
>>>> 0000000000..f01b5ab6bc
>>>> --- /dev/null
>>>> +++ b/examples/common/neon_common.h
>>>> @@ -0,0 +1,50 @@
>>>> +/* SPDX-License-Identifier: BSD-3-Clause
>>>> + * Copyright(c) 2016-2018 Intel Corporation.
>>>> + * Copyright(c) 2017-2018 Linaro Limited.
>>>> + * Copyright(C) 2022 Marvell.
>>>> + */
>>>> +
>>>> +#ifndef _NEON_COMMON_H_
>>>> +#define _NEON_COMMON_H_
>>>> +
>>>> +#include "pkt_group.h"
>>>> +
>>>> +/*
>>>> + * Group consecutive packets with the same destination port in bursts of 4.
>>>> + * Suppose we have array of destination ports:
>>>> + * dst_port[] = {a, b, c, d,, e, ... }
>>>> + * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
>>>> + * We doing 4 comparisons at once and the result is 4 bit mask.
>>>> + * This mask is used as an index into prebuild array of pnum values.
>>>> + */
>>>> +static inline uint16_t *
>>>> +neon_port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t
>> dp1,
>>>> +		  uint16x8_t dp2)
>>>> +{
>>>> +	union {
>>>> +		uint16_t u16[FWDSTEP + 1];
>>>> +		uint64_t u64;
>>>> +	} *pnum = (void *)pn;
>>>> +
>>>> +	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
>>>> +	int32_t v;
>>>> +
>>>> +	dp1 = vceqq_u16(dp1, dp2);
>>>> +	dp1 = vandq_u16(dp1, mask);
>>>> +	v = vaddvq_u16(dp1);
>>>> +
>>>> +	/* update last port counter. */hh
>>>> +	lp[0] += gptbl[v].lpv;
>>>> +	rte_compiler_barrier();
>>>> +
>>>> +	/* if dest port value has changed. */
>>>> +	if (v != GRPMSK) {
>>>> +		pnum->u64 = gptbl[v].pnum;
>>>> +		pnum->u16[FWDSTEP] = 1;
>>>> +		lp = pnum->u16 + gptbl[v].idx;
>>>> +	}
>>>> +
>>>> +	return lp;
>>>> +}
>>
>> Thanks for the effort.
>> As I can see this function: port_groupx4() is nearly identical for all 3
>> platforms: sse/nenon/altivec (except of course built-in arch-specific instincts).
>> In fact, even comemnts are identical.
>> I wonder can we have something like:
>> examples/common/<arch>/port_group.h
>> and for each arch will have defined port_groupx4(...) ?
>>
> Yes, It’s a good point. I was thinking to have arch in file name itself. But we can have arch specific directory and have different header files.
> Do you want me to make changes for all 3 sse/neon/altivec or just neon ?

My thought was to move headers for all archs.

> I can check compilation for all but functionality/perf validate for Neon only.

I can do quick functional test for x86.
Plus I think l3fwd is part of release cycle testing anyway.
Thanks
Konstantin

>>>> +
>>>> +#endif /* _NEON_COMMON_H_ */
>>>> diff --git a/examples/common/pkt_group.h
>>>> b/examples/common/pkt_group.h new file mode 100644 index
>>>> 0000000000..8b26d9380f
>>>> --- /dev/null
>>>> +++ b/examples/common/pkt_group.h
>>>> @@ -0,0 +1,139 @@
>>>> +/* SPDX-License-Identifier: BSD-3-Clause
>>>> + * Copyright(c) 2016-2018 Intel Corporation.
>>>> + * Copyright(c) 2017-2018 Linaro Limited.
>>>> + * Copyright(C) 2022 Marvell.
>>>> + */
>>>> +
>>>> +#ifndef _PKT_GROUP_H_
>>>> +#define _PKT_GROUP_H_
>>>> +
>>>> +#define FWDSTEP	4
>>>> +
>>>> +/*
>>>> + * Group consecutive packets with the same destination port into one burst.
>>>> + * To avoid extra latency this is done together with some other
>>>> +packet
>>>> + * processing, but after we made a final decision about packet's destination.
>>>> + * To do this we maintain:
>>>> + * pnum - array of number of consecutive packets with the same dest
>>>> +port for
>>>> + * each packet in the input burst.
>>>> + * lp - pointer to the last updated element in the pnum.
>>>> + * dlp - dest port value lp corresponds to.
>>>> + */
>>>> +
>>>> +#define	GRPSZ	(1 << FWDSTEP)
>>>> +#define	GRPMSK	(GRPSZ - 1)
>>>> +
>>>> +#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
>>>> +	if (likely((dlp) == (dcp)[(idx)])) {         \
>>>> +		(lp)[0]++;                           \
>>>> +	} else {                                     \
>>>> +		(dlp) = (dcp)[idx];                  \
>>>> +		(lp) = (pn) + (idx);                 \
>>>> +		(lp)[0] = 1;                         \
>>>> +	}                                            \
>>>> +} while (0)
>>>> +
>>>> +static const struct {
>>>> +	uint64_t pnum; /* prebuild 4 values for pnum[]. */
>>>> +	int32_t  idx;  /* index for new last updated elemnet. */
>>>> +	uint16_t lpv;  /* add value to the last updated element. */ }
>>>> +gptbl[GRPSZ] = {
>>>> +	{
>>>> +		/* 0: a != b, b != c, c != d, d != e */
>>>> +		.pnum = UINT64_C(0x0001000100010001),
>>>> +		.idx = 4,
>>>> +		.lpv = 0,
>>>> +	},
>>>> +	{
>>>> +		/* 1: a == b, b != c, c != d, d != e */
>>>> +		.pnum = UINT64_C(0x0001000100010002),
>>>> +		.idx = 4,
>>>> +		.lpv = 1,
>>>> +	},
>>>> +	{
>>>> +		/* 2: a != b, b == c, c != d, d != e */
>>>> +		.pnum = UINT64_C(0x0001000100020001),
>>>> +		.idx = 4,
>>>> +		.lpv = 0,
>>>> +	},
>>>> +	{
>>>> +		/* 3: a == b, b == c, c != d, d != e */
>>>> +		.pnum = UINT64_C(0x0001000100020003),
>>>> +		.idx = 4,
>>>> +		.lpv = 2,
>>>> +	},
>>>> +	{
>>>> +		/* 4: a != b, b != c, c == d, d != e */
>>>> +		.pnum = UINT64_C(0x0001000200010001),
>>>> +		.idx = 4,
>>>> +		.lpv = 0,
>>>> +	},
>>>> +	{
>>>> +		/* 5: a == b, b != c, c == d, d != e */
>>>> +		.pnum = UINT64_C(0x0001000200010002),
>>>> +		.idx = 4,
>>>> +		.lpv = 1,
>>>> +	},
>>>> +	{
>>>> +		/* 6: a != b, b == c, c == d, d != e */
>>>> +		.pnum = UINT64_C(0x0001000200030001),
>>>> +		.idx = 4,
>>>> +		.lpv = 0,
>>>> +	},
>>>> +	{
>>>> +		/* 7: a == b, b == c, c == d, d != e */
>>>> +		.pnum = UINT64_C(0x0001000200030004),
>>>> +		.idx = 4,
>>>> +		.lpv = 3,
>>>> +	},
>>>> +	{
>>>> +		/* 8: a != b, b != c, c != d, d == e */
>>>> +		.pnum = UINT64_C(0x0002000100010001),
>>>> +		.idx = 3,
>>>> +		.lpv = 0,
>>>> +	},
>>>> +	{
>>>> +		/* 9: a == b, b != c, c != d, d == e */
>>>> +		.pnum = UINT64_C(0x0002000100010002),
>>>> +		.idx = 3,
>>>> +		.lpv = 1,
>>>> +	},
>>>> +	{
>>>> +		/* 0xa: a != b, b == c, c != d, d == e */
>>>> +		.pnum = UINT64_C(0x0002000100020001),
>>>> +		.idx = 3,
>>>> +		.lpv = 0,
>>>> +	},
>>>> +	{
>>>> +		/* 0xb: a == b, b == c, c != d, d == e */
>>>> +		.pnum = UINT64_C(0x0002000100020003),
>>>> +		.idx = 3,
>>>> +		.lpv = 2,
>>>> +	},
>>>> +	{
>>>> +		/* 0xc: a != b, b != c, c == d, d == e */
>>>> +		.pnum = UINT64_C(0x0002000300010001),
>>>> +		.idx = 2,
>>>> +		.lpv = 0,
>>>> +	},
>>>> +	{
>>>> +		/* 0xd: a == b, b != c, c == d, d == e */
>>>> +		.pnum = UINT64_C(0x0002000300010002),
>>>> +		.idx = 2,
>>>> +		.lpv = 1,
>>>> +	},
>>>> +	{
>>>> +		/* 0xe: a != b, b == c, c == d, d == e */
>>>> +		.pnum = UINT64_C(0x0002000300040001),
>>>> +		.idx = 1,
>>>> +		.lpv = 0,
>>>> +	},
>>>> +	{
>>>> +		/* 0xf: a == b, b == c, c == d, d == e */
>>>> +		.pnum = UINT64_C(0x0002000300040005),
>>>> +		.idx = 0,
>>>> +		.lpv = 4,
>>>> +	},
>>>> +};
>>>> +
>>>> +#endif /* _PKT_GROUP_H_ */
>>>> diff --git a/examples/l3fwd/Makefile b/examples/l3fwd/Makefile index
>>>> 8efe6378e2..8dbe85c2e6 100644
>>>> --- a/examples/l3fwd/Makefile
>>>> +++ b/examples/l3fwd/Makefile
>>>> @@ -22,6 +22,7 @@ shared: build/$(APP)-shared
>>>>    static: build/$(APP)-static
>>>>    	ln -sf $(APP)-static build/$(APP)
>>>>
>>>> +INCLUDES =-I../common
>>>>    PC_FILE := $(shell $(PKGCONF) --path libdpdk 2>/dev/null)  CFLAGS
>>>> += -O3 $(shell $(PKGCONF) --cflags libdpdk)  # Added for
>> 'rte_eth_link_to_str()'
>>>> @@ -38,10 +39,10 @@ endif
>>>>    endif
>>>>
>>>>    build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
>>>> -	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
>>>> +	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS)
>>>> +$(LDFLAGS_SHARED)
>>>>
>>>>    build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
>>>> -	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)
>>>> +	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS)
>>>> +$(LDFLAGS_STATIC)
>>>>
>>>>    build:
>>>>    	@mkdir -p $@
>>>> diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h index
>>>> 8a52c90755..40b5f32a9e 100644
>>>> --- a/examples/l3fwd/l3fwd.h
>>>> +++ b/examples/l3fwd/l3fwd.h
>>>> @@ -44,8 +44,6 @@
>>>>    /* Used to mark destination port as 'invalid'. */
>>>>    #define	BAD_PORT ((uint16_t)-1)
>>>>
>>>> -#define FWDSTEP	4
>>>> -
>>>>    /* replace first 12B of the ethernet header. */
>>>>    #define	MASK_ETH 0x3f
>>>>
>>>> diff --git a/examples/l3fwd/l3fwd_common.h
>>>> b/examples/l3fwd/l3fwd_common.h index 8e4c27218f..224b1c08e8 100644
>>>> --- a/examples/l3fwd/l3fwd_common.h
>>>> +++ b/examples/l3fwd/l3fwd_common.h
>>>> @@ -7,6 +7,8 @@
>>>>    #ifndef _L3FWD_COMMON_H_
>>>>    #define _L3FWD_COMMON_H_
>>>>
>>>> +#include "pkt_group.h"
>>>> +
>>>>    #ifdef DO_RFC_1812_CHECKS
>>>>
>>>>    #define	IPV4_MIN_VER_IHL	0x45
>>>> @@ -50,133 +52,6 @@ rfc1812_process(struct rte_ipv4_hdr *ipv4_hdr,
>>>> uint16_t *dp, uint32_t ptype)
>>>>    #define	rfc1812_process(mb, dp, ptype)	do { } while (0)
>>>>    #endif /* DO_RFC_1812_CHECKS */
>>>>
>>>> -/*
>>>> - * We group consecutive packets with the same destination port into one
>> burst.
>>>> - * To avoid extra latency this is done together with some other
>>>> packet
>>>> - * processing, but after we made a final decision about packet's destination.
>>>> - * To do this we maintain:
>>>> - * pnum - array of number of consecutive packets with the same dest
>>>> port for
>>>> - * each packet in the input burst.
>>>> - * lp - pointer to the last updated element in the pnum.
>>>> - * dlp - dest port value lp corresponds to.
>>>> - */
>>>> -
>>>> -#define	GRPSZ	(1 << FWDSTEP)
>>>> -#define	GRPMSK	(GRPSZ - 1)
>>>> -
>>>> -#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
>>>> -	if (likely((dlp) == (dcp)[(idx)])) {             \
>>>> -		(lp)[0]++;                                   \
>>>> -	} else {                                         \
>>>> -		(dlp) = (dcp)[idx];                          \
>>>> -		(lp) = (pn) + (idx);                         \
>>>> -		(lp)[0] = 1;                                 \
>>>> -	}                                                \
>>>> -} while (0)
>>>> -
>>>> -static const struct {
>>>> -	uint64_t pnum; /* prebuild 4 values for pnum[]. */
>>>> -	int32_t  idx;  /* index for new last updated element. */
>>>> -	uint16_t lpv;  /* add value to the last updated element. */
>>>> -} gptbl[GRPSZ] = {
>>>> -	{
>>>> -		/* 0: a != b, b != c, c != d, d != e */
>>>> -		.pnum = UINT64_C(0x0001000100010001),
>>>> -		.idx = 4,
>>>> -		.lpv = 0,
>>>> -	},
>>>> -	{
>>>> -		/* 1: a == b, b != c, c != d, d != e */
>>>> -		.pnum = UINT64_C(0x0001000100010002),
>>>> -		.idx = 4,
>>>> -		.lpv = 1,
>>>> -	},
>>>> -	{
>>>> -		/* 2: a != b, b == c, c != d, d != e */
>>>> -		.pnum = UINT64_C(0x0001000100020001),
>>>> -		.idx = 4,
>>>> -		.lpv = 0,
>>>> -	},
>>>> -	{
>>>> -		/* 3: a == b, b == c, c != d, d != e */
>>>> -		.pnum = UINT64_C(0x0001000100020003),
>>>> -		.idx = 4,
>>>> -		.lpv = 2,
>>>> -	},
>>>> -	{
>>>> -		/* 4: a != b, b != c, c == d, d != e */
>>>> -		.pnum = UINT64_C(0x0001000200010001),
>>>> -		.idx = 4,
>>>> -		.lpv = 0,
>>>> -	},
>>>> -	{
>>>> -		/* 5: a == b, b != c, c == d, d != e */
>>>> -		.pnum = UINT64_C(0x0001000200010002),
>>>> -		.idx = 4,
>>>> -		.lpv = 1,
>>>> -	},
>>>> -	{
>>>> -		/* 6: a != b, b == c, c == d, d != e */
>>>> -		.pnum = UINT64_C(0x0001000200030001),
>>>> -		.idx = 4,
>>>> -		.lpv = 0,
>>>> -	},
>>>> -	{
>>>> -		/* 7: a == b, b == c, c == d, d != e */
>>>> -		.pnum = UINT64_C(0x0001000200030004),
>>>> -		.idx = 4,
>>>> -		.lpv = 3,
>>>> -	},
>>>> -	{
>>>> -		/* 8: a != b, b != c, c != d, d == e */
>>>> -		.pnum = UINT64_C(0x0002000100010001),
>>>> -		.idx = 3,
>>>> -		.lpv = 0,
>>>> -	},
>>>> -	{
>>>> -		/* 9: a == b, b != c, c != d, d == e */
>>>> -		.pnum = UINT64_C(0x0002000100010002),
>>>> -		.idx = 3,
>>>> -		.lpv = 1,
>>>> -	},
>>>> -	{
>>>> -		/* 0xa: a != b, b == c, c != d, d == e */
>>>> -		.pnum = UINT64_C(0x0002000100020001),
>>>> -		.idx = 3,
>>>> -		.lpv = 0,
>>>> -	},
>>>> -	{
>>>> -		/* 0xb: a == b, b == c, c != d, d == e */
>>>> -		.pnum = UINT64_C(0x0002000100020003),
>>>> -		.idx = 3,
>>>> -		.lpv = 2,
>>>> -	},
>>>> -	{
>>>> -		/* 0xc: a != b, b != c, c == d, d == e */
>>>> -		.pnum = UINT64_C(0x0002000300010001),
>>>> -		.idx = 2,
>>>> -		.lpv = 0,
>>>> -	},
>>>> -	{
>>>> -		/* 0xd: a == b, b != c, c == d, d == e */
>>>> -		.pnum = UINT64_C(0x0002000300010002),
>>>> -		.idx = 2,
>>>> -		.lpv = 1,
>>>> -	},
>>>> -	{
>>>> -		/* 0xe: a != b, b == c, c == d, d == e */
>>>> -		.pnum = UINT64_C(0x0002000300040001),
>>>> -		.idx = 1,
>>>> -		.lpv = 0,
>>>> -	},
>>>> -	{
>>>> -		/* 0xf: a == b, b == c, c == d, d == e */
>>>> -		.pnum = UINT64_C(0x0002000300040005),
>>>> -		.idx = 0,
>>>> -		.lpv = 4,
>>>> -	},
>>>> -};
>>>> -
>>>>    static __rte_always_inline void
>>>>    send_packetsx4(struct lcore_conf *qconf, uint16_t port, struct rte_mbuf
>> *m[],
>>>>    		uint32_t num)
>>>> diff --git a/examples/l3fwd/l3fwd_neon.h
>>>> b/examples/l3fwd/l3fwd_neon.h index e3d33a5229..5fa765b640 100644
>>>> --- a/examples/l3fwd/l3fwd_neon.h
>>>> +++ b/examples/l3fwd/l3fwd_neon.h
>>>> @@ -7,6 +7,7 @@
>>>>    #define _L3FWD_NEON_H_
>>>>
>>>>    #include "l3fwd.h"
>>>> +#include "neon_common.h"
>>>>    #include "l3fwd_common.h"
>>>>
>>>>    /*
>>>> @@ -62,44 +63,6 @@ processx4_step3(struct rte_mbuf *pkt[FWDSTEP],
>>>> uint16_t dst_port[FWDSTEP])
>>>>    			&dst_port[3], pkt[3]->packet_type);
>>>>    }
>>>>
>>>> -/*
>>>> - * Group consecutive packets with the same destination port in bursts of 4.
>>>> - * Suppose we have array of destination ports:
>>>> - * dst_port[] = {a, b, c, d,, e, ... }
>>>> - * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
>>>> - * We doing 4 comparisons at once and the result is 4 bit mask.
>>>> - * This mask is used as an index into prebuild array of pnum values.
>>>> - */
>>>> -static inline uint16_t *
>>>> -port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
>>>> -	     uint16x8_t dp2)
>>>> -{
>>>> -	union {
>>>> -		uint16_t u16[FWDSTEP + 1];
>>>> -		uint64_t u64;
>>>> -	} *pnum = (void *)pn;
>>>> -
>>>> -	int32_t v;
>>>> -	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
>>>> -
>>>> -	dp1 = vceqq_u16(dp1, dp2);
>>>> -	dp1 = vandq_u16(dp1, mask);
>>>> -	v = vaddvq_u16(dp1);
>>>> -
>>>> -	/* update last port counter. */
>>>> -	lp[0] += gptbl[v].lpv;
>>>> -	rte_compiler_barrier();
>>>> -
>>>> -	/* if dest port value has changed. */
>>>> -	if (v != GRPMSK) {
>>>> -		pnum->u64 = gptbl[v].pnum;
>>>> -		pnum->u16[FWDSTEP] = 1;
>>>> -		lp = pnum->u16 + gptbl[v].idx;
>>>> -	}
>>>> -
>>>> -	return lp;
>>>> -}
>>>> -
>>>>    /**
>>>>     * Process one packet:
>>>>     * Update source and destination MAC addresses in the ethernet header.
>>>> @@ -161,7 +124,7 @@ send_packets_multi(struct lcore_conf *qconf,
>>>> struct rte_mbuf **pkts_burst,
>>>>    			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
>>>>    			 */
>>>>    			dp2 = vld1q_u16(&dst_port[j - FWDSTEP + 1]);
>>>> -			lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
>>>> +			lp  = neon_port_groupx4(&pnum[j - FWDSTEP], lp, dp1,
>>>> dp2);
>>>>
>>>>    			/*
>>>>    			 * dp1:
>>>> @@ -175,7 +138,7 @@ send_packets_multi(struct lcore_conf *qconf,
>>>> struct rte_mbuf **pkts_burst,
>>>>    		 */
>>>>    		dp2 = vextq_u16(dp1, dp1, 1);
>>>>    		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
>>>> -		lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
>>>> +		lp  = neon_port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
>>>>
>>>>    		/*
>>>>    		 * remove values added by the last repeated diff --git
>>>> a/examples/meson.build b/examples/meson.build index
>>>> 78de0e1f37..81e93799f2 100644
>>>> --- a/examples/meson.build
>>>> +++ b/examples/meson.build
>>>> @@ -97,7 +97,7 @@ foreach example: examples
>>>>        ldflags = default_ldflags
>>>>
>>>>        ext_deps = []
>>>> -    includes = [include_directories(example)]
>>>> +    includes = [include_directories(example, 'common')]
>>>>        deps = ['eal', 'mempool', 'net', 'mbuf', 'ethdev', 'cmdline']
>>>>        subdir(example)
>>>>
>>>> --
>>>> 2.25.1
>>>
>
  

Patch

diff --git a/examples/common/neon_common.h b/examples/common/neon_common.h
new file mode 100644
index 0000000000..f01b5ab6bc
--- /dev/null
+++ b/examples/common/neon_common.h
@@ -0,0 +1,50 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016-2018 Intel Corporation.
+ * Copyright(c) 2017-2018 Linaro Limited.
+ * Copyright(C) 2022 Marvell.
+ */
+
+#ifndef _NEON_COMMON_H_
+#define _NEON_COMMON_H_
+
+#include "pkt_group.h"
+
+/*
+ * Group consecutive packets with the same destination port in bursts of 4.
+ * Suppose we have array of destination ports:
+ * dst_port[] = {a, b, c, d,, e, ... }
+ * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
+ * We doing 4 comparisons at once and the result is 4 bit mask.
+ * This mask is used as an index into prebuild array of pnum values.
+ */
+static inline uint16_t *
+neon_port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
+		  uint16x8_t dp2)
+{
+	union {
+		uint16_t u16[FWDSTEP + 1];
+		uint64_t u64;
+	} *pnum = (void *)pn;
+
+	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
+	int32_t v;
+
+	dp1 = vceqq_u16(dp1, dp2);
+	dp1 = vandq_u16(dp1, mask);
+	v = vaddvq_u16(dp1);
+
+	/* update last port counter. */
+	lp[0] += gptbl[v].lpv;
+	rte_compiler_barrier();
+
+	/* if dest port value has changed. */
+	if (v != GRPMSK) {
+		pnum->u64 = gptbl[v].pnum;
+		pnum->u16[FWDSTEP] = 1;
+		lp = pnum->u16 + gptbl[v].idx;
+	}
+
+	return lp;
+}
+
+#endif /* _NEON_COMMON_H_ */
diff --git a/examples/common/pkt_group.h b/examples/common/pkt_group.h
new file mode 100644
index 0000000000..8b26d9380f
--- /dev/null
+++ b/examples/common/pkt_group.h
@@ -0,0 +1,139 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016-2018 Intel Corporation.
+ * Copyright(c) 2017-2018 Linaro Limited.
+ * Copyright(C) 2022 Marvell.
+ */
+
+#ifndef _PKT_GROUP_H_
+#define _PKT_GROUP_H_
+
+#define FWDSTEP	4
+
+/*
+ * Group consecutive packets with the same destination port into one burst.
+ * To avoid extra latency this is done together with some other packet
+ * processing, but after we made a final decision about packet's destination.
+ * To do this we maintain:
+ * pnum - array of number of consecutive packets with the same dest port for
+ * each packet in the input burst.
+ * lp - pointer to the last updated element in the pnum.
+ * dlp - dest port value lp corresponds to.
+ */
+
+#define	GRPSZ	(1 << FWDSTEP)
+#define	GRPMSK	(GRPSZ - 1)
+
+#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
+	if (likely((dlp) == (dcp)[(idx)])) {         \
+		(lp)[0]++;                           \
+	} else {                                     \
+		(dlp) = (dcp)[idx];                  \
+		(lp) = (pn) + (idx);                 \
+		(lp)[0] = 1;                         \
+	}                                            \
+} while (0)
+
+static const struct {
+	uint64_t pnum; /* prebuild 4 values for pnum[]. */
+	int32_t  idx;  /* index for new last updated elemnet. */
+	uint16_t lpv;  /* add value to the last updated element. */
+} gptbl[GRPSZ] = {
+	{
+		/* 0: a != b, b != c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100010001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 1: a == b, b != c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100010002),
+		.idx = 4,
+		.lpv = 1,
+	},
+	{
+		/* 2: a != b, b == c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100020001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 3: a == b, b == c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100020003),
+		.idx = 4,
+		.lpv = 2,
+	},
+	{
+		/* 4: a != b, b != c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200010001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 5: a == b, b != c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200010002),
+		.idx = 4,
+		.lpv = 1,
+	},
+	{
+		/* 6: a != b, b == c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200030001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 7: a == b, b == c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200030004),
+		.idx = 4,
+		.lpv = 3,
+	},
+	{
+		/* 8: a != b, b != c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100010001),
+		.idx = 3,
+		.lpv = 0,
+	},
+	{
+		/* 9: a == b, b != c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100010002),
+		.idx = 3,
+		.lpv = 1,
+	},
+	{
+		/* 0xa: a != b, b == c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100020001),
+		.idx = 3,
+		.lpv = 0,
+	},
+	{
+		/* 0xb: a == b, b == c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100020003),
+		.idx = 3,
+		.lpv = 2,
+	},
+	{
+		/* 0xc: a != b, b != c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300010001),
+		.idx = 2,
+		.lpv = 0,
+	},
+	{
+		/* 0xd: a == b, b != c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300010002),
+		.idx = 2,
+		.lpv = 1,
+	},
+	{
+		/* 0xe: a != b, b == c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300040001),
+		.idx = 1,
+		.lpv = 0,
+	},
+	{
+		/* 0xf: a == b, b == c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300040005),
+		.idx = 0,
+		.lpv = 4,
+	},
+};
+
+#endif /* _PKT_GROUP_H_ */
diff --git a/examples/l3fwd/Makefile b/examples/l3fwd/Makefile
index 8efe6378e2..8dbe85c2e6 100644
--- a/examples/l3fwd/Makefile
+++ b/examples/l3fwd/Makefile
@@ -22,6 +22,7 @@  shared: build/$(APP)-shared
 static: build/$(APP)-static
 	ln -sf $(APP)-static build/$(APP)

+INCLUDES =-I../common
 PC_FILE := $(shell $(PKGCONF) --path libdpdk 2>/dev/null)
 CFLAGS += -O3 $(shell $(PKGCONF) --cflags libdpdk)
 # Added for 'rte_eth_link_to_str()'
@@ -38,10 +39,10 @@  endif
 endif

 build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
-	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
+	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)

 build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
-	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)
+	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)

 build:
 	@mkdir -p $@
diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h
index 8a52c90755..40b5f32a9e 100644
--- a/examples/l3fwd/l3fwd.h
+++ b/examples/l3fwd/l3fwd.h
@@ -44,8 +44,6 @@ 
 /* Used to mark destination port as 'invalid'. */
 #define	BAD_PORT ((uint16_t)-1)

-#define FWDSTEP	4
-
 /* replace first 12B of the ethernet header. */
 #define	MASK_ETH 0x3f

diff --git a/examples/l3fwd/l3fwd_common.h b/examples/l3fwd/l3fwd_common.h
index 8e4c27218f..224b1c08e8 100644
--- a/examples/l3fwd/l3fwd_common.h
+++ b/examples/l3fwd/l3fwd_common.h
@@ -7,6 +7,8 @@ 
 #ifndef _L3FWD_COMMON_H_
 #define _L3FWD_COMMON_H_

+#include "pkt_group.h"
+
 #ifdef DO_RFC_1812_CHECKS

 #define	IPV4_MIN_VER_IHL	0x45
@@ -50,133 +52,6 @@  rfc1812_process(struct rte_ipv4_hdr *ipv4_hdr, uint16_t *dp, uint32_t ptype)
 #define	rfc1812_process(mb, dp, ptype)	do { } while (0)
 #endif /* DO_RFC_1812_CHECKS */

-/*
- * We group consecutive packets with the same destination port into one burst.
- * To avoid extra latency this is done together with some other packet
- * processing, but after we made a final decision about packet's destination.
- * To do this we maintain:
- * pnum - array of number of consecutive packets with the same dest port for
- * each packet in the input burst.
- * lp - pointer to the last updated element in the pnum.
- * dlp - dest port value lp corresponds to.
- */
-
-#define	GRPSZ	(1 << FWDSTEP)
-#define	GRPMSK	(GRPSZ - 1)
-
-#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
-	if (likely((dlp) == (dcp)[(idx)])) {             \
-		(lp)[0]++;                                   \
-	} else {                                         \
-		(dlp) = (dcp)[idx];                          \
-		(lp) = (pn) + (idx);                         \
-		(lp)[0] = 1;                                 \
-	}                                                \
-} while (0)
-
-static const struct {
-	uint64_t pnum; /* prebuild 4 values for pnum[]. */
-	int32_t  idx;  /* index for new last updated element. */
-	uint16_t lpv;  /* add value to the last updated element. */
-} gptbl[GRPSZ] = {
-	{
-		/* 0: a != b, b != c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100010001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 1: a == b, b != c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100010002),
-		.idx = 4,
-		.lpv = 1,
-	},
-	{
-		/* 2: a != b, b == c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100020001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 3: a == b, b == c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100020003),
-		.idx = 4,
-		.lpv = 2,
-	},
-	{
-		/* 4: a != b, b != c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200010001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 5: a == b, b != c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200010002),
-		.idx = 4,
-		.lpv = 1,
-	},
-	{
-		/* 6: a != b, b == c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200030001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 7: a == b, b == c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200030004),
-		.idx = 4,
-		.lpv = 3,
-	},
-	{
-		/* 8: a != b, b != c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100010001),
-		.idx = 3,
-		.lpv = 0,
-	},
-	{
-		/* 9: a == b, b != c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100010002),
-		.idx = 3,
-		.lpv = 1,
-	},
-	{
-		/* 0xa: a != b, b == c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100020001),
-		.idx = 3,
-		.lpv = 0,
-	},
-	{
-		/* 0xb: a == b, b == c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100020003),
-		.idx = 3,
-		.lpv = 2,
-	},
-	{
-		/* 0xc: a != b, b != c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300010001),
-		.idx = 2,
-		.lpv = 0,
-	},
-	{
-		/* 0xd: a == b, b != c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300010002),
-		.idx = 2,
-		.lpv = 1,
-	},
-	{
-		/* 0xe: a != b, b == c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300040001),
-		.idx = 1,
-		.lpv = 0,
-	},
-	{
-		/* 0xf: a == b, b == c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300040005),
-		.idx = 0,
-		.lpv = 4,
-	},
-};
-
 static __rte_always_inline void
 send_packetsx4(struct lcore_conf *qconf, uint16_t port, struct rte_mbuf *m[],
 		uint32_t num)
diff --git a/examples/l3fwd/l3fwd_neon.h b/examples/l3fwd/l3fwd_neon.h
index e3d33a5229..5fa765b640 100644
--- a/examples/l3fwd/l3fwd_neon.h
+++ b/examples/l3fwd/l3fwd_neon.h
@@ -7,6 +7,7 @@ 
 #define _L3FWD_NEON_H_

 #include "l3fwd.h"
+#include "neon_common.h"
 #include "l3fwd_common.h"

 /*
@@ -62,44 +63,6 @@  processx4_step3(struct rte_mbuf *pkt[FWDSTEP], uint16_t dst_port[FWDSTEP])
 			&dst_port[3], pkt[3]->packet_type);
 }

-/*
- * Group consecutive packets with the same destination port in bursts of 4.
- * Suppose we have array of destination ports:
- * dst_port[] = {a, b, c, d,, e, ... }
- * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
- * We doing 4 comparisons at once and the result is 4 bit mask.
- * This mask is used as an index into prebuild array of pnum values.
- */
-static inline uint16_t *
-port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
-	     uint16x8_t dp2)
-{
-	union {
-		uint16_t u16[FWDSTEP + 1];
-		uint64_t u64;
-	} *pnum = (void *)pn;
-
-	int32_t v;
-	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
-
-	dp1 = vceqq_u16(dp1, dp2);
-	dp1 = vandq_u16(dp1, mask);
-	v = vaddvq_u16(dp1);
-
-	/* update last port counter. */
-	lp[0] += gptbl[v].lpv;
-	rte_compiler_barrier();
-
-	/* if dest port value has changed. */
-	if (v != GRPMSK) {
-		pnum->u64 = gptbl[v].pnum;
-		pnum->u16[FWDSTEP] = 1;
-		lp = pnum->u16 + gptbl[v].idx;
-	}
-
-	return lp;
-}
-
 /**
  * Process one packet:
  * Update source and destination MAC addresses in the ethernet header.
@@ -161,7 +124,7 @@  send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
 			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
 			 */
 			dp2 = vld1q_u16(&dst_port[j - FWDSTEP + 1]);
-			lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
+			lp  = neon_port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);

 			/*
 			 * dp1:
@@ -175,7 +138,7 @@  send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
 		 */
 		dp2 = vextq_u16(dp1, dp1, 1);
 		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
-		lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
+		lp  = neon_port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);

 		/*
 		 * remove values added by the last repeated
diff --git a/examples/meson.build b/examples/meson.build
index 78de0e1f37..81e93799f2 100644
--- a/examples/meson.build
+++ b/examples/meson.build
@@ -97,7 +97,7 @@  foreach example: examples
     ldflags = default_ldflags

     ext_deps = []
-    includes = [include_directories(example)]
+    includes = [include_directories(example, 'common')]
     deps = ['eal', 'mempool', 'net', 'mbuf', 'ethdev', 'cmdline']
     subdir(example)