[v1,20/26] node: ipv4 lookup for x86

Message ID 20200318213551.3489504-21-jerinj@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series: graph: introduce graph subsystem

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation fail Compilation issues

Commit Message

Jerin Jacob Kollanukkaran March 18, 2020, 9:35 p.m. UTC
  From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add the IPv4 lookup process function for the ip4_lookup
rte_node. This node performs an LPM lookup on every
received packet using the x86_64 vector (SSE) RTE_LPM API
and forwards the packet to the next node identified by the
lookup result.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>
Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
---
 lib/librte_node/ip4_lookup.c | 245 +++++++++++++++++++++++++++++++++++
 1 file changed, 245 insertions(+)
  

Comments

Ray Kinsella March 19, 2020, 12:25 p.m. UTC | #1
On 18/03/2020 21:35, jerinj@marvell.com wrote:
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
> 
> Add IPv4 lookup process function for ip4_lookup
> rte_node. This node performs LPM lookup using x86_64
> vector supported RTE_LPM API on every packet received
> and forwards it to a next node that is identified by
> lookup result.
> 
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>
> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> ---
>  lib/librte_node/ip4_lookup.c | 245 +++++++++++++++++++++++++++++++++++
>  1 file changed, 245 insertions(+)
> 
> diff --git a/lib/librte_node/ip4_lookup.c b/lib/librte_node/ip4_lookup.c
> index d7fcd1158..c003e9c91 100644
> --- a/lib/librte_node/ip4_lookup.c
> +++ b/lib/librte_node/ip4_lookup.c
> @@ -264,6 +264,251 @@ ip4_lookup_node_process(struct rte_graph *graph, struct rte_node *node,
>  	return nb_objs;
>  }
>  
> +#elif defined(RTE_ARCH_X86)
> +
> +/* X86 SSE */
> +static uint16_t
> +ip4_lookup_node_process(struct rte_graph *graph, struct rte_node *node,
> +			void **objs, uint16_t nb_objs)
> +{
> +	struct rte_mbuf *mbuf0, *mbuf1, *mbuf2, *mbuf3, **pkts;
> +	rte_edge_t next0, next1, next2, next3, next_index;
> +	struct rte_ipv4_hdr *ipv4_hdr;
> +	struct rte_ether_hdr *eth_hdr;
> +	uint32_t ip0, ip1, ip2, ip3;
> +	void **to_next, **from;
> +	uint16_t last_spec = 0;
> +	uint16_t n_left_from;
> +	struct rte_lpm *lpm;
> +	uint16_t held = 0;
> +	uint32_t drop_nh;
> +	rte_xmm_t dst;
> +	__m128i dip; /* SSE register */
> +	int rc, i;
> +
> +	/* Speculative next */
> +	next_index = RTE_NODE_IP4_LOOKUP_NEXT_REWRITE;
> +	/* Drop node */
> +	drop_nh = ((uint32_t)RTE_NODE_IP4_LOOKUP_NEXT_PKT_DROP) << 16;
> +
> +	/* Get socket specific LPM from ctx */
> +	lpm = *((struct rte_lpm **)node->ctx);
> +
> +	pkts = (struct rte_mbuf **)objs;
> +	from = objs;
> +	n_left_from = nb_objs;

I doubt this initial prefetch of the first 4 packets has any benefit. 

> +	if (n_left_from >= 4) {
> +		for (i = 0; i < 4; i++) {
> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[i],
> +						       struct rte_ether_hdr *) +
> +				      1);
> +		}
> +	}
> +
> +	/* Get stream for the speculated next node */
> +	to_next = rte_node_next_stream_get(graph, node, next_index, nb_objs);

Suggest you don't reuse the hand-unrolling optimizations from FD.io VPP.
I have never found any performance benefit from them, and they make the code unnecessarily verbose.


> +	while (n_left_from >= 4) {
> +		/* Prefetch next-next mbufs */
> +		if (likely(n_left_from >= 11)) {
> +			rte_prefetch0(pkts[8]);
> +			rte_prefetch0(pkts[9]);
> +			rte_prefetch0(pkts[10]);
> +			rte_prefetch0(pkts[11]);
> +		}
> +
> +		/* Prefetch next mbuf data */
> +		if (likely(n_left_from >= 7)) {
> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[4],
> +						       struct rte_ether_hdr *) +
> +				      1);
> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[5],
> +						       struct rte_ether_hdr *) +
> +				      1);
> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[6],
> +						       struct rte_ether_hdr *) +
> +				      1);
> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[7],
> +						       struct rte_ether_hdr *) +
> +				      1);
> +		}
> +
> +		mbuf0 = pkts[0];
> +		mbuf1 = pkts[1];
> +		mbuf2 = pkts[2];
> +		mbuf3 = pkts[3];
> +
> +		pkts += 4;
> +		n_left_from -= 4;
> +
> +		/* Extract DIP of mbuf0 */
> +		eth_hdr = rte_pktmbuf_mtod(mbuf0, struct rte_ether_hdr *);
> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
> +		ip0 = ipv4_hdr->dst_addr;
> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
> +		rte_node_mbuf_priv1(mbuf0)->cksum = ipv4_hdr->hdr_checksum;
> +		rte_node_mbuf_priv1(mbuf0)->ttl = ipv4_hdr->time_to_live;
> +
> +		/* Extract DIP of mbuf1 */
> +		eth_hdr = rte_pktmbuf_mtod(mbuf1, struct rte_ether_hdr *);
> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
> +		ip1 = ipv4_hdr->dst_addr;
> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
> +		rte_node_mbuf_priv1(mbuf1)->cksum = ipv4_hdr->hdr_checksum;
> +		rte_node_mbuf_priv1(mbuf1)->ttl = ipv4_hdr->time_to_live;
> +
> +		/* Extract DIP of mbuf2 */
> +		eth_hdr = rte_pktmbuf_mtod(mbuf2, struct rte_ether_hdr *);
> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
> +		ip2 = ipv4_hdr->dst_addr;
> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
> +		rte_node_mbuf_priv1(mbuf2)->cksum = ipv4_hdr->hdr_checksum;
> +		rte_node_mbuf_priv1(mbuf2)->ttl = ipv4_hdr->time_to_live;
> +
> +		/* Extract DIP of mbuf3 */
> +		eth_hdr = rte_pktmbuf_mtod(mbuf3, struct rte_ether_hdr *);
> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
> +		ip3 = ipv4_hdr->dst_addr;
> +
> +		/* Prepare for lookup x4 */
> +		dip = _mm_set_epi32(ip3, ip2, ip1, ip0);
> +
> +		/* Byte swap 4 IPV4 addresses. */
> +		const __m128i bswap_mask = _mm_set_epi8(
> +			12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
> +		dip = _mm_shuffle_epi8(dip, bswap_mask);
> +
> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
> +		rte_node_mbuf_priv1(mbuf3)->cksum = ipv4_hdr->hdr_checksum;
> +		rte_node_mbuf_priv1(mbuf3)->ttl = ipv4_hdr->time_to_live;
> +
> +		/* Perform LPM lookup to get NH and next node */
> +		rte_lpm_lookupx4(lpm, dip, dst.u32, drop_nh);
> +
> +		/* Extract next node id and NH */
> +		rte_node_mbuf_priv1(mbuf0)->nh = dst.u32[0] & 0xFFFF;
> +		next0 = (dst.u32[0] >> 16);
> +
> +		rte_node_mbuf_priv1(mbuf1)->nh = dst.u32[1] & 0xFFFF;
> +		next1 = (dst.u32[1] >> 16);
> +
> +		rte_node_mbuf_priv1(mbuf2)->nh = dst.u32[2] & 0xFFFF;
> +		next2 = (dst.u32[2] >> 16);
> +
> +		rte_node_mbuf_priv1(mbuf3)->nh = dst.u32[3] & 0xFFFF;
> +		next3 = (dst.u32[3] >> 16);
> +
> +		/* Enqueue four to next node */
> +		rte_edge_t fix_spec =
> +			(next_index ^ next0) | (next_index ^ next1) |
> +			(next_index ^ next2) | (next_index ^ next3);
> +
> +		if (unlikely(fix_spec)) {
> +			/* Copy things successfully speculated till now */
> +			rte_memcpy(to_next, from, last_spec * sizeof(from[0]));
> +			from += last_spec;
> +			to_next += last_spec;
> +			held += last_spec;
> +			last_spec = 0;
> +
> +			/* Next0 */
> +			if (next_index == next0) {
> +				to_next[0] = from[0];
> +				to_next++;
> +				held++;
> +			} else {
> +				rte_node_enqueue_x1(graph, node, next0,
> +						    from[0]);
> +			}
> +
> +			/* Next1 */
> +			if (next_index == next1) {
> +				to_next[0] = from[1];
> +				to_next++;
> +				held++;
> +			} else {
> +				rte_node_enqueue_x1(graph, node, next1,
> +						    from[1]);
> +			}
> +
> +			/* Next2 */
> +			if (next_index == next2) {
> +				to_next[0] = from[2];
> +				to_next++;
> +				held++;
> +			} else {
> +				rte_node_enqueue_x1(graph, node, next2,
> +						    from[2]);
> +			}
> +
> +			/* Next3 */
> +			if (next_index == next3) {
> +				to_next[0] = from[3];
> +				to_next++;
> +				held++;
> +			} else {
> +				rte_node_enqueue_x1(graph, node, next3,
> +						    from[3]);
> +			}
> +
> +			from += 4;
> +
> +		} else {
> +			last_spec += 4;
> +		}
> +	}
> +
> +	while (n_left_from > 0) {
> +		uint32_t next_hop;
> +
> +		mbuf0 = pkts[0];
> +
> +		pkts += 1;
> +		n_left_from -= 1;
> +
> +		/* Extract DIP of mbuf0 */
> +		eth_hdr = rte_pktmbuf_mtod(mbuf0, struct rte_ether_hdr *);
> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
> +		rte_node_mbuf_priv1(mbuf0)->cksum = ipv4_hdr->hdr_checksum;
> +		rte_node_mbuf_priv1(mbuf0)->ttl = ipv4_hdr->time_to_live;
> +
> +		rc = rte_lpm_lookup(lpm, rte_be_to_cpu_32(ipv4_hdr->dst_addr),
> +				    &next_hop);
> +		next_hop = (rc == 0) ? next_hop : drop_nh;
> +
> +		rte_node_mbuf_priv1(mbuf0)->nh = next_hop & 0xFFFF;
> +		next0 = (next_hop >> 16);
> +
> +		if (unlikely(next_index ^ next0)) {
> +			/* Copy things successfully speculated till now */
> +			rte_memcpy(to_next, from, last_spec * sizeof(from[0]));
> +			from += last_spec;
> +			to_next += last_spec;
> +			held += last_spec;
> +			last_spec = 0;
> +
> +			rte_node_enqueue_x1(graph, node, next0, from[0]);
> +			from += 1;
> +		} else {
> +			last_spec += 1;
> +		}
> +	}
> +
> +	/* !!! Home run !!! */
> +	if (likely(last_spec == nb_objs)) {
> +		rte_node_next_stream_move(graph, node, next_index);
> +		return nb_objs;
> +	}
> +
> +	held += last_spec;
> +	/* Copy things successfully speculated till now */
> +	rte_memcpy(to_next, from, last_spec * sizeof(from[0]));
> +	rte_node_next_stream_put(graph, node, next_index, held);
> +
> +	return nb_objs;
> +}
> +
>  #else
>  
>  static uint16_t
>
  
Pavan Nikhilesh Bhagavatula March 19, 2020, 2:22 p.m. UTC | #2
>On 18/03/2020 21:35, jerinj@marvell.com wrote:
>> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>>
>> Add IPv4 lookup process function for ip4_lookup
>> rte_node. This node performs LPM lookup using x86_64
>> vector supported RTE_LPM API on every packet received
>> and forwards it to a next node that is identified by
>> lookup result.
>>
>> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
>> Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>
>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
>> ---
>>  lib/librte_node/ip4_lookup.c | 245
>+++++++++++++++++++++++++++++++++++
>>  1 file changed, 245 insertions(+)
>>
>> diff --git a/lib/librte_node/ip4_lookup.c
>b/lib/librte_node/ip4_lookup.c
>> index d7fcd1158..c003e9c91 100644
>> --- a/lib/librte_node/ip4_lookup.c
>> +++ b/lib/librte_node/ip4_lookup.c
>> @@ -264,6 +264,251 @@ ip4_lookup_node_process(struct rte_graph
>*graph, struct rte_node *node,
>>  	return nb_objs;
>>  }
>>
>> +#elif defined(RTE_ARCH_X86)
>> +
>> +/* X86 SSE */
>> +static uint16_t
>> +ip4_lookup_node_process(struct rte_graph *graph, struct rte_node
>*node,
>> +			void **objs, uint16_t nb_objs)
>> +{
>> +	struct rte_mbuf *mbuf0, *mbuf1, *mbuf2, *mbuf3, **pkts;
>> +	rte_edge_t next0, next1, next2, next3, next_index;
>> +	struct rte_ipv4_hdr *ipv4_hdr;
>> +	struct rte_ether_hdr *eth_hdr;
>> +	uint32_t ip0, ip1, ip2, ip3;
>> +	void **to_next, **from;
>> +	uint16_t last_spec = 0;
>> +	uint16_t n_left_from;
>> +	struct rte_lpm *lpm;
>> +	uint16_t held = 0;
>> +	uint32_t drop_nh;
>> +	rte_xmm_t dst;
>> +	__m128i dip; /* SSE register */
>> +	int rc, i;
>> +
>> +	/* Speculative next */
>> +	next_index = RTE_NODE_IP4_LOOKUP_NEXT_REWRITE;
>> +	/* Drop node */
>> +	drop_nh =
>((uint32_t)RTE_NODE_IP4_LOOKUP_NEXT_PKT_DROP) << 16;
>> +
>> +	/* Get socket specific LPM from ctx */
>> +	lpm = *((struct rte_lpm **)node->ctx);
>> +
>> +	pkts = (struct rte_mbuf **)objs;
>> +	from = objs;
>> +	n_left_from = nb_objs;
>
>I doubt this initial prefetch of the first 4 packets has any benefit.

Ack, will remove in v2 for x86.

>
>> +	if (n_left_from >= 4) {
>> +		for (i = 0; i < 4; i++) {
>> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[i],
>> +						       struct rte_ether_hdr
>*) +
>> +				      1);
>> +		}
>> +	}
>> +
>> +	/* Get stream for the speculated next node */
>> +	to_next = rte_node_next_stream_get(graph, node,
>next_index, nb_objs);
>
>Suggest you don't reuse the hand-unrolling optimization from FD.io
>VPP.
>I have never found any performance benefit from them, and they
>make the code unnecessarily verbose.
>

How would we take advantage of rte_lpm_lookupx4 without unrolling the loop?
Also, in the future, if we use rte_rib and fib with a CPU supporting wider SIMD, we might
need to unroll further (AVX256 and AVX512; currently rte_lpm_lookup uses only 128-bit
vectors since it only uses the SSE extension).
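
For reference, a minimal sketch of the x4 usage in question (illustrative only; it mirrors the
rte_lpm_lookupx4() call already in the patch, with dip/dst/drop_nh as used there):

	/* Pack the four destination addresses (already byte-swapped to host
	 * order, as in the patch) into one SSE register and resolve them in
	 * a single LPM call; entries that miss get drop_nh as the default. */
	dip = _mm_set_epi32(ip3, ip2, ip1, ip0);
	rte_lpm_lookupx4(lpm, dip, dst.u32, drop_nh);
	/* dst.u32[0..3] now hold next-hop and next-node data, one per packet. */

The API consumes four addresses at once, which is why the loop gathers four packets per iteration.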

>
>> +	while (n_left_from >= 4) {
>> +		/* Prefetch next-next mbufs */
>> +		if (likely(n_left_from >= 11)) {
>> +			rte_prefetch0(pkts[8]);
>> +			rte_prefetch0(pkts[9]);
>> +			rte_prefetch0(pkts[10]);
>> +			rte_prefetch0(pkts[11]);
>> +		}
>> +
>> +		/* Prefetch next mbuf data */
>> +		if (likely(n_left_from >= 7)) {
>> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[4],
>> +						       struct rte_ether_hdr
>*) +
>> +				      1);
>> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[5],
>> +						       struct rte_ether_hdr
>*) +
>> +				      1);
>> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[6],
>> +						       struct rte_ether_hdr
>*) +
>> +				      1);
>> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[7],
>> +						       struct rte_ether_hdr
>*) +
>> +				      1);
>> +		}
>> +
>> +		mbuf0 = pkts[0];
>> +		mbuf1 = pkts[1];
>> +		mbuf2 = pkts[2];
>> +		mbuf3 = pkts[3];
>> +
>> +		pkts += 4;
>> +		n_left_from -= 4;
>> +
>> +		/* Extract DIP of mbuf0 */
>> +		eth_hdr = rte_pktmbuf_mtod(mbuf0, struct
>rte_ether_hdr *);
>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>> +		ip0 = ipv4_hdr->dst_addr;
>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>> +		rte_node_mbuf_priv1(mbuf0)->cksum = ipv4_hdr-
>>hdr_checksum;
>> +		rte_node_mbuf_priv1(mbuf0)->ttl = ipv4_hdr-
>>time_to_live;
>> +
>> +		/* Extract DIP of mbuf1 */
>> +		eth_hdr = rte_pktmbuf_mtod(mbuf1, struct
>rte_ether_hdr *);
>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>> +		ip1 = ipv4_hdr->dst_addr;
>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>> +		rte_node_mbuf_priv1(mbuf1)->cksum = ipv4_hdr-
>>hdr_checksum;
>> +		rte_node_mbuf_priv1(mbuf1)->ttl = ipv4_hdr-
>>time_to_live;
>> +
>> +		/* Extract DIP of mbuf2 */
>> +		eth_hdr = rte_pktmbuf_mtod(mbuf2, struct
>rte_ether_hdr *);
>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>> +		ip2 = ipv4_hdr->dst_addr;
>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>> +		rte_node_mbuf_priv1(mbuf2)->cksum = ipv4_hdr-
>>hdr_checksum;
>> +		rte_node_mbuf_priv1(mbuf2)->ttl = ipv4_hdr-
>>time_to_live;
>> +
>> +		/* Extract DIP of mbuf3 */
>> +		eth_hdr = rte_pktmbuf_mtod(mbuf3, struct
>rte_ether_hdr *);
>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>> +		ip3 = ipv4_hdr->dst_addr;
>> +
>> +		/* Prepare for lookup x4 */
>> +		dip = _mm_set_epi32(ip3, ip2, ip1, ip0);
>> +
>> +		/* Byte swap 4 IPV4 addresses. */
>> +		const __m128i bswap_mask = _mm_set_epi8(
>> +			12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
>> +		dip = _mm_shuffle_epi8(dip, bswap_mask);
>> +
>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>> +		rte_node_mbuf_priv1(mbuf3)->cksum = ipv4_hdr-
>>hdr_checksum;
>> +		rte_node_mbuf_priv1(mbuf3)->ttl = ipv4_hdr-
>>time_to_live;
>> +
>> +		/* Perform LPM lookup to get NH and next node */
>> +		rte_lpm_lookupx4(lpm, dip, dst.u32, drop_nh);
>> +
>> +		/* Extract next node id and NH */
>> +		rte_node_mbuf_priv1(mbuf0)->nh = dst.u32[0] &
>0xFFFF;
>> +		next0 = (dst.u32[0] >> 16);
>> +
>> +		rte_node_mbuf_priv1(mbuf1)->nh = dst.u32[1] &
>0xFFFF;
>> +		next1 = (dst.u32[1] >> 16);
>> +
>> +		rte_node_mbuf_priv1(mbuf2)->nh = dst.u32[2] &
>0xFFFF;
>> +		next2 = (dst.u32[2] >> 16);
>> +
>> +		rte_node_mbuf_priv1(mbuf3)->nh = dst.u32[3] &
>0xFFFF;
>> +		next3 = (dst.u32[3] >> 16);
>> +
>> +		/* Enqueue four to next node */
>> +		rte_edge_t fix_spec =
>> +			(next_index ^ next0) | (next_index ^ next1) |
>> +			(next_index ^ next2) | (next_index ^ next3);
>> +
>> +		if (unlikely(fix_spec)) {
>> +			/* Copy things successfully speculated till now
>*/
>> +			rte_memcpy(to_next, from, last_spec *
>sizeof(from[0]));
>> +			from += last_spec;
>> +			to_next += last_spec;
>> +			held += last_spec;
>> +			last_spec = 0;
>> +
>> +			/* Next0 */
>> +			if (next_index == next0) {
>> +				to_next[0] = from[0];
>> +				to_next++;
>> +				held++;
>> +			} else {
>> +				rte_node_enqueue_x1(graph, node,
>next0,
>> +						    from[0]);
>> +			}
>> +
>> +			/* Next1 */
>> +			if (next_index == next1) {
>> +				to_next[0] = from[1];
>> +				to_next++;
>> +				held++;
>> +			} else {
>> +				rte_node_enqueue_x1(graph, node,
>next1,
>> +						    from[1]);
>> +			}
>> +
>> +			/* Next2 */
>> +			if (next_index == next2) {
>> +				to_next[0] = from[2];
>> +				to_next++;
>> +				held++;
>> +			} else {
>> +				rte_node_enqueue_x1(graph, node,
>next2,
>> +						    from[2]);
>> +			}
>> +
>> +			/* Next3 */
>> +			if (next_index == next3) {
>> +				to_next[0] = from[3];
>> +				to_next++;
>> +				held++;
>> +			} else {
>> +				rte_node_enqueue_x1(graph, node,
>next3,
>> +						    from[3]);
>> +			}
>> +
>> +			from += 4;
>> +
>> +		} else {
>> +			last_spec += 4;
>> +		}
>> +	}
>> +
>> +	while (n_left_from > 0) {
>> +		uint32_t next_hop;
>> +
>> +		mbuf0 = pkts[0];
>> +
>> +		pkts += 1;
>> +		n_left_from -= 1;
>> +
>> +		/* Extract DIP of mbuf0 */
>> +		eth_hdr = rte_pktmbuf_mtod(mbuf0, struct
>rte_ether_hdr *);
>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>> +		rte_node_mbuf_priv1(mbuf0)->cksum = ipv4_hdr-
>>hdr_checksum;
>> +		rte_node_mbuf_priv1(mbuf0)->ttl = ipv4_hdr-
>>time_to_live;
>> +
>> +		rc = rte_lpm_lookup(lpm, rte_be_to_cpu_32(ipv4_hdr-
>>dst_addr),
>> +				    &next_hop);
>> +		next_hop = (rc == 0) ? next_hop : drop_nh;
>> +
>> +		rte_node_mbuf_priv1(mbuf0)->nh = next_hop &
>0xFFFF;
>> +		next0 = (next_hop >> 16);
>> +
>> +		if (unlikely(next_index ^ next0)) {
>> +			/* Copy things successfully speculated till now
>*/
>> +			rte_memcpy(to_next, from, last_spec *
>sizeof(from[0]));
>> +			from += last_spec;
>> +			to_next += last_spec;
>> +			held += last_spec;
>> +			last_spec = 0;
>> +
>> +			rte_node_enqueue_x1(graph, node, next0,
>from[0]);
>> +			from += 1;
>> +		} else {
>> +			last_spec += 1;
>> +		}
>> +	}
>> +
>> +	/* !!! Home run !!! */
>> +	if (likely(last_spec == nb_objs)) {
>> +		rte_node_next_stream_move(graph, node,
>next_index);
>> +		return nb_objs;
>> +	}
>> +
>> +	held += last_spec;
>> +	/* Copy things successfully speculated till now */
>> +	rte_memcpy(to_next, from, last_spec * sizeof(from[0]));
>> +	rte_node_next_stream_put(graph, node, next_index, held);
>> +
>> +	return nb_objs;
>> +}
>> +
>>  #else
>>
>>  static uint16_t
>>
  
Ray Kinsella March 19, 2020, 3:50 p.m. UTC | #3
On 19/03/2020 14:22, Pavan Nikhilesh Bhagavatula wrote:
>> On 18/03/2020 21:35, jerinj@marvell.com wrote:
>>> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>>>
>>> Add IPv4 lookup process function for ip4_lookup
>>> rte_node. This node performs LPM lookup using x86_64
>>> vector supported RTE_LPM API on every packet received
>>> and forwards it to a next node that is identified by
>>> lookup result.
>>>
>>> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
>>> Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>
>>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
>>> ---
>>>  lib/librte_node/ip4_lookup.c | 245
>> +++++++++++++++++++++++++++++++++++
>>>  1 file changed, 245 insertions(+)
>>>
>>> diff --git a/lib/librte_node/ip4_lookup.c
>> b/lib/librte_node/ip4_lookup.c
>>> index d7fcd1158..c003e9c91 100644
>>> --- a/lib/librte_node/ip4_lookup.c
>>> +++ b/lib/librte_node/ip4_lookup.c
>>> @@ -264,6 +264,251 @@ ip4_lookup_node_process(struct rte_graph
>> *graph, struct rte_node *node,
>>>  	return nb_objs;
>>>  }
>>>
>>> +#elif defined(RTE_ARCH_X86)
>>> +
>>> +/* X86 SSE */
>>> +static uint16_t
>>> +ip4_lookup_node_process(struct rte_graph *graph, struct rte_node
>> *node,
>>> +			void **objs, uint16_t nb_objs)
>>> +{
>>> +	struct rte_mbuf *mbuf0, *mbuf1, *mbuf2, *mbuf3, **pkts;
>>> +	rte_edge_t next0, next1, next2, next3, next_index;
>>> +	struct rte_ipv4_hdr *ipv4_hdr;
>>> +	struct rte_ether_hdr *eth_hdr;
>>> +	uint32_t ip0, ip1, ip2, ip3;
>>> +	void **to_next, **from;
>>> +	uint16_t last_spec = 0;
>>> +	uint16_t n_left_from;
>>> +	struct rte_lpm *lpm;
>>> +	uint16_t held = 0;
>>> +	uint32_t drop_nh;
>>> +	rte_xmm_t dst;
>>> +	__m128i dip; /* SSE register */
>>> +	int rc, i;
>>> +
>>> +	/* Speculative next */
>>> +	next_index = RTE_NODE_IP4_LOOKUP_NEXT_REWRITE;
>>> +	/* Drop node */
>>> +	drop_nh =
>> ((uint32_t)RTE_NODE_IP4_LOOKUP_NEXT_PKT_DROP) << 16;
>>> +
>>> +	/* Get socket specific LPM from ctx */
>>> +	lpm = *((struct rte_lpm **)node->ctx);
>>> +
>>> +	pkts = (struct rte_mbuf **)objs;
>>> +	from = objs;
>>> +	n_left_from = nb_objs;
>>
>> I doubt this initial prefetch of the first 4 packets has any benefit.
> 
> Ack will remove in v2 for x86.
> 
>>
>>> +	if (n_left_from >= 4) {
>>> +		for (i = 0; i < 4; i++) {
>>> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[i],
>>> +						       struct rte_ether_hdr
>> *) +
>>> +				      1);
>>> +		}
>>> +	}
>>> +
>>> +	/* Get stream for the speculated next node */
>>> +	to_next = rte_node_next_stream_get(graph, node,
>> next_index, nb_objs);
>>
>> Suggest you don't reuse the hand-unrolling optimization from FD.io
>> VPP.
>> I have never found any performance benefit from them, and they
>> make the code unnecessarily verbose.
>>
> 
> How would be take the benefit of rte_lpm_lookupx4 without unrolling the loop?.
> Also, in future if we are using rte_rib and fib with a CPU supporting wider SIMD we might
> need to unroll them further (AVX256 AND 512 currently rte_lpm_lookup uses only 128bit
> since it is only uses SSE extension). 

Let the compiler do it for you, by using a constant vector length, e.g.:
for (int i=0; i < 4; ++i) { ... }
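
A rough sketch of that shape (illustrative; ip[] in place of ip0..ip3 is an assumption, the
other variables are the ones from the patch). With a constant trip count the compiler can
unroll this itself while still feeding rte_lpm_lookupx4():

	uint32_t ip[4];

	for (i = 0; i < 4; i++) {
		/* Per-packet header extraction written once */
		eth_hdr = rte_pktmbuf_mtod(pkts[i], struct rte_ether_hdr *);
		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
		ip[i] = ipv4_hdr->dst_addr;
		rte_node_mbuf_priv1(pkts[i])->cksum = ipv4_hdr->hdr_checksum;
		rte_node_mbuf_priv1(pkts[i])->ttl = ipv4_hdr->time_to_live;
	}

	/* Byte-swap the four big-endian addresses, as in the patch */
	const __m128i bswap_mask = _mm_set_epi8(
		12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
	dip = _mm_set_epi32(ip[3], ip[2], ip[1], ip[0]);
	dip = _mm_shuffle_epi8(dip, bswap_mask);
	rte_lpm_lookupx4(lpm, dip, dst.u32, drop_nh);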

> 
>>
>>> +	while (n_left_from >= 4) {
>>> +		/* Prefetch next-next mbufs */
>>> +		if (likely(n_left_from >= 11)) {
>>> +			rte_prefetch0(pkts[8]);
>>> +			rte_prefetch0(pkts[9]);
>>> +			rte_prefetch0(pkts[10]);
>>> +			rte_prefetch0(pkts[11]);
>>> +		}
>>> +
>>> +		/* Prefetch next mbuf data */
>>> +		if (likely(n_left_from >= 7)) {
>>> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[4],
>>> +						       struct rte_ether_hdr
>> *) +
>>> +				      1);
>>> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[5],
>>> +						       struct rte_ether_hdr
>> *) +
>>> +				      1);
>>> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[6],
>>> +						       struct rte_ether_hdr
>> *) +
>>> +				      1);
>>> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[7],
>>> +						       struct rte_ether_hdr
>> *) +
>>> +				      1);
>>> +		}
>>> +
>>> +		mbuf0 = pkts[0];
>>> +		mbuf1 = pkts[1];
>>> +		mbuf2 = pkts[2];
>>> +		mbuf3 = pkts[3];
>>> +
>>> +		pkts += 4;
>>> +		n_left_from -= 4;
>>> +
>>> +		/* Extract DIP of mbuf0 */
>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf0, struct
>> rte_ether_hdr *);
>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>> +		ip0 = ipv4_hdr->dst_addr;
>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>> +		rte_node_mbuf_priv1(mbuf0)->cksum = ipv4_hdr-
>>> hdr_checksum;
>>> +		rte_node_mbuf_priv1(mbuf0)->ttl = ipv4_hdr-
>>> time_to_live;
>>> +
>>> +		/* Extract DIP of mbuf1 */
>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf1, struct
>> rte_ether_hdr *);
>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>> +		ip1 = ipv4_hdr->dst_addr;
>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>> +		rte_node_mbuf_priv1(mbuf1)->cksum = ipv4_hdr-
>>> hdr_checksum;
>>> +		rte_node_mbuf_priv1(mbuf1)->ttl = ipv4_hdr-
>>> time_to_live;
>>> +
>>> +		/* Extract DIP of mbuf2 */
>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf2, struct
>> rte_ether_hdr *);
>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>> +		ip2 = ipv4_hdr->dst_addr;
>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>> +		rte_node_mbuf_priv1(mbuf2)->cksum = ipv4_hdr-
>>> hdr_checksum;
>>> +		rte_node_mbuf_priv1(mbuf2)->ttl = ipv4_hdr-
>>> time_to_live;
>>> +
>>> +		/* Extract DIP of mbuf3 */
>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf3, struct
>> rte_ether_hdr *);
>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>> +		ip3 = ipv4_hdr->dst_addr;
>>> +
>>> +		/* Prepare for lookup x4 */
>>> +		dip = _mm_set_epi32(ip3, ip2, ip1, ip0);
>>> +
>>> +		/* Byte swap 4 IPV4 addresses. */
>>> +		const __m128i bswap_mask = _mm_set_epi8(
>>> +			12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
>>> +		dip = _mm_shuffle_epi8(dip, bswap_mask);
>>> +
>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>> +		rte_node_mbuf_priv1(mbuf3)->cksum = ipv4_hdr-
>>> hdr_checksum;
>>> +		rte_node_mbuf_priv1(mbuf3)->ttl = ipv4_hdr-
>>> time_to_live;
>>> +
>>> +		/* Perform LPM lookup to get NH and next node */
>>> +		rte_lpm_lookupx4(lpm, dip, dst.u32, drop_nh);
>>> +
>>> +		/* Extract next node id and NH */
>>> +		rte_node_mbuf_priv1(mbuf0)->nh = dst.u32[0] &
>> 0xFFFF;
>>> +		next0 = (dst.u32[0] >> 16);
>>> +
>>> +		rte_node_mbuf_priv1(mbuf1)->nh = dst.u32[1] &
>> 0xFFFF;
>>> +		next1 = (dst.u32[1] >> 16);
>>> +
>>> +		rte_node_mbuf_priv1(mbuf2)->nh = dst.u32[2] &
>> 0xFFFF;
>>> +		next2 = (dst.u32[2] >> 16);
>>> +
>>> +		rte_node_mbuf_priv1(mbuf3)->nh = dst.u32[3] &
>> 0xFFFF;
>>> +		next3 = (dst.u32[3] >> 16);
>>> +
>>> +		/* Enqueue four to next node */
>>> +		rte_edge_t fix_spec =
>>> +			(next_index ^ next0) | (next_index ^ next1) |
>>> +			(next_index ^ next2) | (next_index ^ next3);
>>> +
>>> +		if (unlikely(fix_spec)) {
>>> +			/* Copy things successfully speculated till now
>> */
>>> +			rte_memcpy(to_next, from, last_spec *
>> sizeof(from[0]));
>>> +			from += last_spec;
>>> +			to_next += last_spec;
>>> +			held += last_spec;
>>> +			last_spec = 0;
>>> +
>>> +			/* Next0 */
>>> +			if (next_index == next0) {
>>> +				to_next[0] = from[0];
>>> +				to_next++;
>>> +				held++;
>>> +			} else {
>>> +				rte_node_enqueue_x1(graph, node,
>> next0,
>>> +						    from[0]);
>>> +			}
>>> +
>>> +			/* Next1 */
>>> +			if (next_index == next1) {
>>> +				to_next[0] = from[1];
>>> +				to_next++;
>>> +				held++;
>>> +			} else {
>>> +				rte_node_enqueue_x1(graph, node,
>> next1,
>>> +						    from[1]);
>>> +			}
>>> +
>>> +			/* Next2 */
>>> +			if (next_index == next2) {
>>> +				to_next[0] = from[2];
>>> +				to_next++;
>>> +				held++;
>>> +			} else {
>>> +				rte_node_enqueue_x1(graph, node,
>> next2,
>>> +						    from[2]);
>>> +			}
>>> +
>>> +			/* Next3 */
>>> +			if (next_index == next3) {
>>> +				to_next[0] = from[3];
>>> +				to_next++;
>>> +				held++;
>>> +			} else {
>>> +				rte_node_enqueue_x1(graph, node,
>> next3,
>>> +						    from[3]);
>>> +			}
>>> +
>>> +			from += 4;
>>> +
>>> +		} else {
>>> +			last_spec += 4;
>>> +		}
>>> +	}
>>> +
>>> +	while (n_left_from > 0) {
>>> +		uint32_t next_hop;
>>> +
>>> +		mbuf0 = pkts[0];
>>> +
>>> +		pkts += 1;
>>> +		n_left_from -= 1;
>>> +
>>> +		/* Extract DIP of mbuf0 */
>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf0, struct
>> rte_ether_hdr *);
>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>> +		rte_node_mbuf_priv1(mbuf0)->cksum = ipv4_hdr-
>>> hdr_checksum;
>>> +		rte_node_mbuf_priv1(mbuf0)->ttl = ipv4_hdr-
>>> time_to_live;
>>> +
>>> +		rc = rte_lpm_lookup(lpm, rte_be_to_cpu_32(ipv4_hdr-
>>> dst_addr),
>>> +				    &next_hop);
>>> +		next_hop = (rc == 0) ? next_hop : drop_nh;
>>> +
>>> +		rte_node_mbuf_priv1(mbuf0)->nh = next_hop &
>> 0xFFFF;
>>> +		next0 = (next_hop >> 16);
>>> +
>>> +		if (unlikely(next_index ^ next0)) {
>>> +			/* Copy things successfully speculated till now
>> */
>>> +			rte_memcpy(to_next, from, last_spec *
>> sizeof(from[0]));
>>> +			from += last_spec;
>>> +			to_next += last_spec;
>>> +			held += last_spec;
>>> +			last_spec = 0;
>>> +
>>> +			rte_node_enqueue_x1(graph, node, next0,
>> from[0]);
>>> +			from += 1;
>>> +		} else {
>>> +			last_spec += 1;
>>> +		}
>>> +	}
>>> +
>>> +	/* !!! Home run !!! */
>>> +	if (likely(last_spec == nb_objs)) {
>>> +		rte_node_next_stream_move(graph, node,
>> next_index);
>>> +		return nb_objs;
>>> +	}
>>> +
>>> +	held += last_spec;
>>> +	/* Copy things successfully speculated till now */
>>> +	rte_memcpy(to_next, from, last_spec * sizeof(from[0]));
>>> +	rte_node_next_stream_put(graph, node, next_index, held);
>>> +
>>> +	return nb_objs;
>>> +}
>>> +
>>>  #else
>>>
>>>  static uint16_t
>>>
  
Pavan Nikhilesh Bhagavatula March 19, 2020, 4:13 p.m. UTC | #4
>-----Original Message-----
>From: Ray Kinsella <mdr@ashroe.eu>
>Sent: Thursday, March 19, 2020 9:21 PM
>To: Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>; Jerin
>Jacob Kollanukkaran <jerinj@marvell.com>; Nithin Kumar Dabilpuram
><ndabilpuram@marvell.com>
>Cc: dev@dpdk.org; thomas@monjalon.net;
>david.marchand@redhat.com; mattias.ronnblom@ericsson.com; Kiran
>Kumar Kokkilagadda <kirankumark@marvell.com>
>Subject: Re: [EXT] Re: [dpdk-dev] [PATCH v1 20/26] node: ipv4 lookup
>for x86
>
>
>
>On 19/03/2020 14:22, Pavan Nikhilesh Bhagavatula wrote:
>>> On 18/03/2020 21:35, jerinj@marvell.com wrote:
>>>> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>>>>
>>>> Add IPv4 lookup process function for ip4_lookup
>>>> rte_node. This node performs LPM lookup using x86_64
>>>> vector supported RTE_LPM API on every packet received
>>>> and forwards it to a next node that is identified by
>>>> lookup result.
>>>>
>>>> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
>>>> Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>
>>>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
>>>> ---
>>>>  lib/librte_node/ip4_lookup.c | 245
>>> +++++++++++++++++++++++++++++++++++
>>>>  1 file changed, 245 insertions(+)
>>>>
>>>> diff --git a/lib/librte_node/ip4_lookup.c
>>> b/lib/librte_node/ip4_lookup.c
>>>> index d7fcd1158..c003e9c91 100644
>>>> --- a/lib/librte_node/ip4_lookup.c
>>>> +++ b/lib/librte_node/ip4_lookup.c
>>>> @@ -264,6 +264,251 @@ ip4_lookup_node_process(struct
>rte_graph
>>> *graph, struct rte_node *node,
>>>>  	return nb_objs;
>>>>  }
>>>>
>>>> +#elif defined(RTE_ARCH_X86)
>>>> +
>>>> +/* X86 SSE */
>>>> +static uint16_t
>>>> +ip4_lookup_node_process(struct rte_graph *graph, struct
>rte_node
>>> *node,
>>>> +			void **objs, uint16_t nb_objs)
>>>> +{
>>>> +	struct rte_mbuf *mbuf0, *mbuf1, *mbuf2, *mbuf3, **pkts;
>>>> +	rte_edge_t next0, next1, next2, next3, next_index;
>>>> +	struct rte_ipv4_hdr *ipv4_hdr;
>>>> +	struct rte_ether_hdr *eth_hdr;
>>>> +	uint32_t ip0, ip1, ip2, ip3;
>>>> +	void **to_next, **from;
>>>> +	uint16_t last_spec = 0;
>>>> +	uint16_t n_left_from;
>>>> +	struct rte_lpm *lpm;
>>>> +	uint16_t held = 0;
>>>> +	uint32_t drop_nh;
>>>> +	rte_xmm_t dst;
>>>> +	__m128i dip; /* SSE register */
>>>> +	int rc, i;
>>>> +
>>>> +	/* Speculative next */
>>>> +	next_index = RTE_NODE_IP4_LOOKUP_NEXT_REWRITE;
>>>> +	/* Drop node */
>>>> +	drop_nh =
>>> ((uint32_t)RTE_NODE_IP4_LOOKUP_NEXT_PKT_DROP) << 16;
>>>> +
>>>> +	/* Get socket specific LPM from ctx */
>>>> +	lpm = *((struct rte_lpm **)node->ctx);
>>>> +
>>>> +	pkts = (struct rte_mbuf **)objs;
>>>> +	from = objs;
>>>> +	n_left_from = nb_objs;
>>>
>>> I doubt this initial prefetch of the first 4 packets has any benefit.
>>
>> Ack will remove in v2 for x86.
>>
>>>
>>>> +	if (n_left_from >= 4) {
>>>> +		for (i = 0; i < 4; i++) {
>>>> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[i],
>>>> +						       struct rte_ether_hdr
>>> *) +
>>>> +				      1);
>>>> +		}
>>>> +	}
>>>> +
>>>> +	/* Get stream for the speculated next node */
>>>> +	to_next = rte_node_next_stream_get(graph, node,
>>> next_index, nb_objs);
>>>
>>> Suggest you don't reuse the hand-unrolling optimization from FD.io
>>> VPP.
>>> I have never found any performance benefit from them, and they
>>> make the code unnecessarily verbose.
>>>
>>
>> How would be take the benefit of rte_lpm_lookupx4 without
>unrolling the loop?.
>> Also, in future if we are using rte_rib and fib with a CPU supporting
>wider SIMD we might
>> need to unroll them further (AVX256 AND 512 currently
>rte_lpm_lookup uses only 128bit
>> since it is only uses SSE extension).
>
>Let the compiler do it for you, but using a constant vector length.
>for (int i=0; i < 4; ++i) { ... }
>

Ok, I think I misunderstood the previous comment.
It was only about the prefetches in the loop, right?

>>
>>>
>>>> +	while (n_left_from >= 4) {
>>>> +		/* Prefetch next-next mbufs */
>>>> +		if (likely(n_left_from >= 11)) {
>>>> +			rte_prefetch0(pkts[8]);
>>>> +			rte_prefetch0(pkts[9]);
>>>> +			rte_prefetch0(pkts[10]);
>>>> +			rte_prefetch0(pkts[11]);
>>>> +		}
>>>> +
>>>> +		/* Prefetch next mbuf data */
>>>> +		if (likely(n_left_from >= 7)) {
>>>> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[4],
>>>> +						       struct rte_ether_hdr
>>> *) +
>>>> +				      1);
>>>> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[5],
>>>> +						       struct rte_ether_hdr
>>> *) +
>>>> +				      1);
>>>> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[6],
>>>> +						       struct rte_ether_hdr
>>> *) +
>>>> +				      1);
>>>> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[7],
>>>> +						       struct rte_ether_hdr
>>> *) +
>>>> +				      1);
>>>> +		}
>>>> +
>>>> +		mbuf0 = pkts[0];
>>>> +		mbuf1 = pkts[1];
>>>> +		mbuf2 = pkts[2];
>>>> +		mbuf3 = pkts[3];
>>>> +
>>>> +		pkts += 4;
>>>> +		n_left_from -= 4;
>>>> +
>>>> +		/* Extract DIP of mbuf0 */
>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf0, struct
>>> rte_ether_hdr *);
>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>> +		ip0 = ipv4_hdr->dst_addr;
>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>> +		rte_node_mbuf_priv1(mbuf0)->cksum = ipv4_hdr-
>>>> hdr_checksum;
>>>> +		rte_node_mbuf_priv1(mbuf0)->ttl = ipv4_hdr-
>>>> time_to_live;
>>>> +
>>>> +		/* Extract DIP of mbuf1 */
>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf1, struct
>>> rte_ether_hdr *);
>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>> +		ip1 = ipv4_hdr->dst_addr;
>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>> +		rte_node_mbuf_priv1(mbuf1)->cksum = ipv4_hdr-
>>>> hdr_checksum;
>>>> +		rte_node_mbuf_priv1(mbuf1)->ttl = ipv4_hdr-
>>>> time_to_live;
>>>> +
>>>> +		/* Extract DIP of mbuf2 */
>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf2, struct
>>> rte_ether_hdr *);
>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>> +		ip2 = ipv4_hdr->dst_addr;
>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>> +		rte_node_mbuf_priv1(mbuf2)->cksum = ipv4_hdr-
>>>> hdr_checksum;
>>>> +		rte_node_mbuf_priv1(mbuf2)->ttl = ipv4_hdr-
>>>> time_to_live;
>>>> +
>>>> +		/* Extract DIP of mbuf3 */
>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf3, struct
>>> rte_ether_hdr *);
>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>> +		ip3 = ipv4_hdr->dst_addr;
>>>> +
>>>> +		/* Prepare for lookup x4 */
>>>> +		dip = _mm_set_epi32(ip3, ip2, ip1, ip0);
>>>> +
>>>> +		/* Byte swap 4 IPV4 addresses. */
>>>> +		const __m128i bswap_mask = _mm_set_epi8(
>>>> +			12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
>>>> +		dip = _mm_shuffle_epi8(dip, bswap_mask);
>>>> +
>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>> +		rte_node_mbuf_priv1(mbuf3)->cksum = ipv4_hdr-
>>>> hdr_checksum;
>>>> +		rte_node_mbuf_priv1(mbuf3)->ttl = ipv4_hdr-
>>>> time_to_live;
>>>> +
>>>> +		/* Perform LPM lookup to get NH and next node */
>>>> +		rte_lpm_lookupx4(lpm, dip, dst.u32, drop_nh);
>>>> +
>>>> +		/* Extract next node id and NH */
>>>> +		rte_node_mbuf_priv1(mbuf0)->nh = dst.u32[0] &
>>> 0xFFFF;
>>>> +		next0 = (dst.u32[0] >> 16);
>>>> +
>>>> +		rte_node_mbuf_priv1(mbuf1)->nh = dst.u32[1] &
>>> 0xFFFF;
>>>> +		next1 = (dst.u32[1] >> 16);
>>>> +
>>>> +		rte_node_mbuf_priv1(mbuf2)->nh = dst.u32[2] &
>>> 0xFFFF;
>>>> +		next2 = (dst.u32[2] >> 16);
>>>> +
>>>> +		rte_node_mbuf_priv1(mbuf3)->nh = dst.u32[3] &
>>> 0xFFFF;
>>>> +		next3 = (dst.u32[3] >> 16);
>>>> +
>>>> +		/* Enqueue four to next node */
>>>> +		rte_edge_t fix_spec =
>>>> +			(next_index ^ next0) | (next_index ^ next1) |
>>>> +			(next_index ^ next2) | (next_index ^ next3);
>>>> +
>>>> +		if (unlikely(fix_spec)) {
>>>> +			/* Copy things successfully speculated till now
>>> */
>>>> +			rte_memcpy(to_next, from, last_spec *
>>> sizeof(from[0]));
>>>> +			from += last_spec;
>>>> +			to_next += last_spec;
>>>> +			held += last_spec;
>>>> +			last_spec = 0;
>>>> +
>>>> +			/* Next0 */
>>>> +			if (next_index == next0) {
>>>> +				to_next[0] = from[0];
>>>> +				to_next++;
>>>> +				held++;
>>>> +			} else {
>>>> +				rte_node_enqueue_x1(graph, node,
>>> next0,
>>>> +						    from[0]);
>>>> +			}
>>>> +
>>>> +			/* Next1 */
>>>> +			if (next_index == next1) {
>>>> +				to_next[0] = from[1];
>>>> +				to_next++;
>>>> +				held++;
>>>> +			} else {
>>>> +				rte_node_enqueue_x1(graph, node,
>>> next1,
>>>> +						    from[1]);
>>>> +			}
>>>> +
>>>> +			/* Next2 */
>>>> +			if (next_index == next2) {
>>>> +				to_next[0] = from[2];
>>>> +				to_next++;
>>>> +				held++;
>>>> +			} else {
>>>> +				rte_node_enqueue_x1(graph, node,
>>> next2,
>>>> +						    from[2]);
>>>> +			}
>>>> +
>>>> +			/* Next3 */
>>>> +			if (next_index == next3) {
>>>> +				to_next[0] = from[3];
>>>> +				to_next++;
>>>> +				held++;
>>>> +			} else {
>>>> +				rte_node_enqueue_x1(graph, node,
>>> next3,
>>>> +						    from[3]);
>>>> +			}
>>>> +
>>>> +			from += 4;
>>>> +
>>>> +		} else {
>>>> +			last_spec += 4;
>>>> +		}
>>>> +	}
>>>> +
>>>> +	while (n_left_from > 0) {
>>>> +		uint32_t next_hop;
>>>> +
>>>> +		mbuf0 = pkts[0];
>>>> +
>>>> +		pkts += 1;
>>>> +		n_left_from -= 1;
>>>> +
>>>> +		/* Extract DIP of mbuf0 */
>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf0, struct
>>> rte_ether_hdr *);
>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>> +		rte_node_mbuf_priv1(mbuf0)->cksum = ipv4_hdr-
>>>> hdr_checksum;
>>>> +		rte_node_mbuf_priv1(mbuf0)->ttl = ipv4_hdr-
>>>> time_to_live;
>>>> +
>>>> +		rc = rte_lpm_lookup(lpm, rte_be_to_cpu_32(ipv4_hdr-
>>>> dst_addr),
>>>> +				    &next_hop);
>>>> +		next_hop = (rc == 0) ? next_hop : drop_nh;
>>>> +
>>>> +		rte_node_mbuf_priv1(mbuf0)->nh = next_hop &
>>> 0xFFFF;
>>>> +		next0 = (next_hop >> 16);
>>>> +
>>>> +		if (unlikely(next_index ^ next0)) {
>>>> +			/* Copy things successfully speculated till now
>>> */
>>>> +			rte_memcpy(to_next, from, last_spec *
>>> sizeof(from[0]));
>>>> +			from += last_spec;
>>>> +			to_next += last_spec;
>>>> +			held += last_spec;
>>>> +			last_spec = 0;
>>>> +
>>>> +			rte_node_enqueue_x1(graph, node, next0,
>>> from[0]);
>>>> +			from += 1;
>>>> +		} else {
>>>> +			last_spec += 1;
>>>> +		}
>>>> +	}
>>>> +
>>>> +	/* !!! Home run !!! */
>>>> +	if (likely(last_spec == nb_objs)) {
>>>> +		rte_node_next_stream_move(graph, node,
>>> next_index);
>>>> +		return nb_objs;
>>>> +	}
>>>> +
>>>> +	held += last_spec;
>>>> +	/* Copy things successfully speculated till now */
>>>> +	rte_memcpy(to_next, from, last_spec * sizeof(from[0]));
>>>> +	rte_node_next_stream_put(graph, node, next_index, held);
>>>> +
>>>> +	return nb_objs;
>>>> +}
>>>> +
>>>>  #else
>>>>
>>>>  static uint16_t
>>>>
  
Ray Kinsella March 20, 2020, 9:14 a.m. UTC | #5
On 19/03/2020 16:13, Pavan Nikhilesh Bhagavatula wrote:
> 
> 
>> -----Original Message-----
>> From: Ray Kinsella <mdr@ashroe.eu>
>> Sent: Thursday, March 19, 2020 9:21 PM
>> To: Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>; Jerin
>> Jacob Kollanukkaran <jerinj@marvell.com>; Nithin Kumar Dabilpuram
>> <ndabilpuram@marvell.com>
>> Cc: dev@dpdk.org; thomas@monjalon.net;
>> david.marchand@redhat.com; mattias.ronnblom@ericsson.com; Kiran
>> Kumar Kokkilagadda <kirankumark@marvell.com>
>> Subject: Re: [EXT] Re: [dpdk-dev] [PATCH v1 20/26] node: ipv4 lookup
>> for x86
>>
>>
>>
>> On 19/03/2020 14:22, Pavan Nikhilesh Bhagavatula wrote:
>>>> On 18/03/2020 21:35, jerinj@marvell.com wrote:
>>>>> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>>>>>
>>>>> Add IPv4 lookup process function for ip4_lookup
>>>>> rte_node. This node performs LPM lookup using x86_64
>>>>> vector supported RTE_LPM API on every packet received
>>>>> and forwards it to a next node that is identified by
>>>>> lookup result.
>>>>>
>>>>> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
>>>>> Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>
>>>>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
>>>>> ---
>>>>>  lib/librte_node/ip4_lookup.c | 245
>>>> +++++++++++++++++++++++++++++++++++
>>>>>  1 file changed, 245 insertions(+)
>>>>>
>>>>> diff --git a/lib/librte_node/ip4_lookup.c
>>>> b/lib/librte_node/ip4_lookup.c
>>>>> index d7fcd1158..c003e9c91 100644
>>>>> --- a/lib/librte_node/ip4_lookup.c
>>>>> +++ b/lib/librte_node/ip4_lookup.c
>>>>> @@ -264,6 +264,251 @@ ip4_lookup_node_process(struct
>> rte_graph
>>>> *graph, struct rte_node *node,
>>>>>  	return nb_objs;
>>>>>  }
>>>>>
>>>>> +#elif defined(RTE_ARCH_X86)
>>>>> +
>>>>> +/* X86 SSE */
>>>>> +static uint16_t
>>>>> +ip4_lookup_node_process(struct rte_graph *graph, struct
>> rte_node
>>>> *node,
>>>>> +			void **objs, uint16_t nb_objs)
>>>>> +{
>>>>> +	struct rte_mbuf *mbuf0, *mbuf1, *mbuf2, *mbuf3, **pkts;
>>>>> +	rte_edge_t next0, next1, next2, next3, next_index;
>>>>> +	struct rte_ipv4_hdr *ipv4_hdr;
>>>>> +	struct rte_ether_hdr *eth_hdr;
>>>>> +	uint32_t ip0, ip1, ip2, ip3;
>>>>> +	void **to_next, **from;
>>>>> +	uint16_t last_spec = 0;
>>>>> +	uint16_t n_left_from;
>>>>> +	struct rte_lpm *lpm;
>>>>> +	uint16_t held = 0;
>>>>> +	uint32_t drop_nh;
>>>>> +	rte_xmm_t dst;
>>>>> +	__m128i dip; /* SSE register */
>>>>> +	int rc, i;
>>>>> +
>>>>> +	/* Speculative next */
>>>>> +	next_index = RTE_NODE_IP4_LOOKUP_NEXT_REWRITE;
>>>>> +	/* Drop node */
>>>>> +	drop_nh =
>>>> ((uint32_t)RTE_NODE_IP4_LOOKUP_NEXT_PKT_DROP) << 16;
>>>>> +
>>>>> +	/* Get socket specific LPM from ctx */
>>>>> +	lpm = *((struct rte_lpm **)node->ctx);
>>>>> +
>>>>> +	pkts = (struct rte_mbuf **)objs;
>>>>> +	from = objs;
>>>>> +	n_left_from = nb_objs;
>>>>
>>>> I doubt this initial prefetch of the first 4 packets has any benefit.
>>>
>>> Ack will remove in v2 for x86.
>>>
>>>>
>>>>> +	if (n_left_from >= 4) {
>>>>> +		for (i = 0; i < 4; i++) {
>>>>> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[i],
>>>>> +						       struct rte_ether_hdr
>>>> *) +
>>>>> +				      1);
>>>>> +		}
>>>>> +	}
>>>>> +
>>>>> +	/* Get stream for the speculated next node */
>>>>> +	to_next = rte_node_next_stream_get(graph, node,
>>>> next_index, nb_objs);
>>>>
>>>> Suggest you don't reuse the hand-unrolling optimization from FD.io
>>>> VPP.
>>>> I have never found any performance benefit from them, and they
>>>> make the code unnecessarily verbose.
>>>>
>>>
>>> How would be take the benefit of rte_lpm_lookupx4 without
>> unrolling the loop?.
>>> Also, in future if we are using rte_rib and fib with a CPU supporting
>> wider SIMD we might
>>> need to unroll them further (AVX256 AND 512 currently
>> rte_lpm_lookup uses only 128bit
>>> since it is only uses SSE extension).
>>
>> Let the compiler do it for you, but using a constant vector length.
>> for (int i=0; i < 4; ++i) { ... }
>>
> 
> Ok, I think I misunderstood the previous comment. 
> It was only for the prefetches in the loop right?


No, it was for all the needless repetition.
Hand-unrolling loops serves no purpose other than adding verbosity.

> 
>>>
>>>>
>>>>> +	while (n_left_from >= 4) {
>>>>> +		/* Prefetch next-next mbufs */
>>>>> +		if (likely(n_left_from >= 11)) {
>>>>> +			rte_prefetch0(pkts[8]);
>>>>> +			rte_prefetch0(pkts[9]);
>>>>> +			rte_prefetch0(pkts[10]);
>>>>> +			rte_prefetch0(pkts[11]);
>>>>> +		}
>>>>> +
>>>>> +		/* Prefetch next mbuf data */
>>>>> +		if (likely(n_left_from >= 7)) {
>>>>> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[4],
>>>>> +						       struct rte_ether_hdr
>>>> *) +
>>>>> +				      1);
>>>>> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[5],
>>>>> +						       struct rte_ether_hdr
>>>> *) +
>>>>> +				      1);
>>>>> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[6],
>>>>> +						       struct rte_ether_hdr
>>>> *) +
>>>>> +				      1);
>>>>> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[7],
>>>>> +						       struct rte_ether_hdr
>>>> *) +
>>>>> +				      1);
>>>>> +		}
>>>>> +
>>>>> +		mbuf0 = pkts[0];
>>>>> +		mbuf1 = pkts[1];
>>>>> +		mbuf2 = pkts[2];
>>>>> +		mbuf3 = pkts[3];
>>>>> +
>>>>> +		pkts += 4;
>>>>> +		n_left_from -= 4;
>>>>> +
>>>>> +		/* Extract DIP of mbuf0 */
>>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf0, struct
>>>> rte_ether_hdr *);
>>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>>> +		ip0 = ipv4_hdr->dst_addr;
>>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>>> +		rte_node_mbuf_priv1(mbuf0)->cksum = ipv4_hdr-
>>>>> hdr_checksum;
>>>>> +		rte_node_mbuf_priv1(mbuf0)->ttl = ipv4_hdr-
>>>>> time_to_live;
>>>>> +
>>>>> +		/* Extract DIP of mbuf1 */
>>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf1, struct
>>>> rte_ether_hdr *);
>>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>>> +		ip1 = ipv4_hdr->dst_addr;
>>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>>> +		rte_node_mbuf_priv1(mbuf1)->cksum = ipv4_hdr-
>>>>> hdr_checksum;
>>>>> +		rte_node_mbuf_priv1(mbuf1)->ttl = ipv4_hdr-
>>>>> time_to_live;
>>>>> +
>>>>> +		/* Extract DIP of mbuf2 */
>>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf2, struct
>>>> rte_ether_hdr *);
>>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>>> +		ip2 = ipv4_hdr->dst_addr;
>>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>>> +		rte_node_mbuf_priv1(mbuf2)->cksum = ipv4_hdr-
>>>>> hdr_checksum;
>>>>> +		rte_node_mbuf_priv1(mbuf2)->ttl = ipv4_hdr-
>>>>> time_to_live;
>>>>> +
>>>>> +		/* Extract DIP of mbuf3 */
>>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf3, struct
>>>> rte_ether_hdr *);
>>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>>> +		ip3 = ipv4_hdr->dst_addr;
>>>>> +
>>>>> +		/* Prepare for lookup x4 */
>>>>> +		dip = _mm_set_epi32(ip3, ip2, ip1, ip0);
>>>>> +
>>>>> +		/* Byte swap 4 IPV4 addresses. */
>>>>> +		const __m128i bswap_mask = _mm_set_epi8(
>>>>> +			12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
>>>>> +		dip = _mm_shuffle_epi8(dip, bswap_mask);
>>>>> +
>>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>>> +		rte_node_mbuf_priv1(mbuf3)->cksum = ipv4_hdr-
>>>>> hdr_checksum;
>>>>> +		rte_node_mbuf_priv1(mbuf3)->ttl = ipv4_hdr-
>>>>> time_to_live;
>>>>> +
>>>>> +		/* Perform LPM lookup to get NH and next node */
>>>>> +		rte_lpm_lookupx4(lpm, dip, dst.u32, drop_nh);
>>>>> +
>>>>> +		/* Extract next node id and NH */
>>>>> +		rte_node_mbuf_priv1(mbuf0)->nh = dst.u32[0] &
>>>> 0xFFFF;
>>>>> +		next0 = (dst.u32[0] >> 16);
>>>>> +
>>>>> +		rte_node_mbuf_priv1(mbuf1)->nh = dst.u32[1] &
>>>> 0xFFFF;
>>>>> +		next1 = (dst.u32[1] >> 16);
>>>>> +
>>>>> +		rte_node_mbuf_priv1(mbuf2)->nh = dst.u32[2] &
>>>> 0xFFFF;
>>>>> +		next2 = (dst.u32[2] >> 16);
>>>>> +
>>>>> +		rte_node_mbuf_priv1(mbuf3)->nh = dst.u32[3] &
>>>> 0xFFFF;
>>>>> +		next3 = (dst.u32[3] >> 16);
>>>>> +
>>>>> +		/* Enqueue four to next node */
>>>>> +		rte_edge_t fix_spec =
>>>>> +			(next_index ^ next0) | (next_index ^ next1) |
>>>>> +			(next_index ^ next2) | (next_index ^ next3);
>>>>> +
>>>>> +		if (unlikely(fix_spec)) {
>>>>> +			/* Copy things successfully speculated till now
>>>> */
>>>>> +			rte_memcpy(to_next, from, last_spec *
>>>> sizeof(from[0]));
>>>>> +			from += last_spec;
>>>>> +			to_next += last_spec;
>>>>> +			held += last_spec;
>>>>> +			last_spec = 0;
>>>>> +
>>>>> +			/* Next0 */
>>>>> +			if (next_index == next0) {
>>>>> +				to_next[0] = from[0];
>>>>> +				to_next++;
>>>>> +				held++;
>>>>> +			} else {
>>>>> +				rte_node_enqueue_x1(graph, node,
>>>> next0,
>>>>> +						    from[0]);
>>>>> +			}
>>>>> +
>>>>> +			/* Next1 */
>>>>> +			if (next_index == next1) {
>>>>> +				to_next[0] = from[1];
>>>>> +				to_next++;
>>>>> +				held++;
>>>>> +			} else {
>>>>> +				rte_node_enqueue_x1(graph, node,
>>>> next1,
>>>>> +						    from[1]);
>>>>> +			}
>>>>> +
>>>>> +			/* Next2 */
>>>>> +			if (next_index == next2) {
>>>>> +				to_next[0] = from[2];
>>>>> +				to_next++;
>>>>> +				held++;
>>>>> +			} else {
>>>>> +				rte_node_enqueue_x1(graph, node,
>>>> next2,
>>>>> +						    from[2]);
>>>>> +			}
>>>>> +
>>>>> +			/* Next3 */
>>>>> +			if (next_index == next3) {
>>>>> +				to_next[0] = from[3];
>>>>> +				to_next++;
>>>>> +				held++;
>>>>> +			} else {
>>>>> +				rte_node_enqueue_x1(graph, node,
>>>> next3,
>>>>> +						    from[3]);
>>>>> +			}
>>>>> +
>>>>> +			from += 4;
>>>>> +
>>>>> +		} else {
>>>>> +			last_spec += 4;
>>>>> +		}
>>>>> +	}
>>>>> +
>>>>> +	while (n_left_from > 0) {
>>>>> +		uint32_t next_hop;
>>>>> +
>>>>> +		mbuf0 = pkts[0];
>>>>> +
>>>>> +		pkts += 1;
>>>>> +		n_left_from -= 1;
>>>>> +
>>>>> +		/* Extract DIP of mbuf0 */
>>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf0, struct
>>>> rte_ether_hdr *);
>>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>>> +		rte_node_mbuf_priv1(mbuf0)->cksum = ipv4_hdr-
>>>>> hdr_checksum;
>>>>> +		rte_node_mbuf_priv1(mbuf0)->ttl = ipv4_hdr-
>>>>> time_to_live;
>>>>> +
>>>>> +		rc = rte_lpm_lookup(lpm, rte_be_to_cpu_32(ipv4_hdr-
>>>>> dst_addr),
>>>>> +				    &next_hop);
>>>>> +		next_hop = (rc == 0) ? next_hop : drop_nh;
>>>>> +
>>>>> +		rte_node_mbuf_priv1(mbuf0)->nh = next_hop &
>>>> 0xFFFF;
>>>>> +		next0 = (next_hop >> 16);
>>>>> +
>>>>> +		if (unlikely(next_index ^ next0)) {
>>>>> +			/* Copy things successfully speculated till now
>>>> */
>>>>> +			rte_memcpy(to_next, from, last_spec *
>>>> sizeof(from[0]));
>>>>> +			from += last_spec;
>>>>> +			to_next += last_spec;
>>>>> +			held += last_spec;
>>>>> +			last_spec = 0;
>>>>> +
>>>>> +			rte_node_enqueue_x1(graph, node, next0,
>>>> from[0]);
>>>>> +			from += 1;
>>>>> +		} else {
>>>>> +			last_spec += 1;
>>>>> +		}
>>>>> +	}
>>>>> +
>>>>> +	/* !!! Home run !!! */
>>>>> +	if (likely(last_spec == nb_objs)) {
>>>>> +		rte_node_next_stream_move(graph, node,
>>>> next_index);
>>>>> +		return nb_objs;
>>>>> +	}
>>>>> +
>>>>> +	held += last_spec;
>>>>> +	/* Copy things successfully speculated till now */
>>>>> +	rte_memcpy(to_next, from, last_spec * sizeof(from[0]));
>>>>> +	rte_node_next_stream_put(graph, node, next_index, held);
>>>>> +
>>>>> +	return nb_objs;
>>>>> +}
>>>>> +
>>>>>  #else
>>>>>
>>>>>  static uint16_t
>>>>>
  
Pavan Nikhilesh Bhagavatula March 24, 2020, 9:40 a.m. UTC | #6
Hi Ray, 

I have tried avoiding hand-unrolled loops and made the following observations.

1. Although it decreases LOC, it also hurts readability.
	Example:
	Avoiding the unrolled code below

                priv[0].u64[0] = rte_node_mbuf_priv1(mbuf[0])->u;
                priv[0].u64[1] = rte_node_mbuf_priv1(mbuf[1])->u;
                priv[1].u64[0] = rte_node_mbuf_priv1(mbuf[2])->u;
                priv[1].u64[1] = rte_node_mbuf_priv1(mbuf[3])->u;

                /* Increment checksum by one. */
                priv[0].u32[1] += rte_cpu_to_be_16(0x0100);
                priv[0].u32[3] += rte_cpu_to_be_16(0x0100);
                priv[1].u32[1] += rte_cpu_to_be_16(0x0100);
                priv[1].u32[3] += rte_cpu_to_be_16(0x0100);

                d = rte_pktmbuf_mtod(mbuf[0], void *);
                rte_memcpy(d, nh[priv[0].u16[0]].rewrite_data,
                                nh[priv[0].u16[0]].rewrite_len);
                next[0] = nh[priv[0].u16[0]].tx_node;
                ip = (struct rte_ipv4_hdr *)((uint8_t *)d +
                                sizeof(struct rte_ether_hdr));
                ip->time_to_live = priv[0].u16[1] - 1;
                ip->hdr_checksum = priv[0].u16[2] + priv[0].u16[3];

                d = rte_pktmbuf_mtod(mbuf[1], void *);
                rte_memcpy(d, nh[priv[0].u16[4]].rewrite_data,
                           nh[priv[0].u16[4]].rewrite_len);
                next[1] = nh[priv[0].u16[4]].tx_node;
                ip = (struct rte_ipv4_hdr *)((uint8_t *)d +
                      sizeof(struct rte_ether_hdr));
                ip->time_to_live = priv[0].u16[5] - 1;
                ip->hdr_checksum = priv[0].u16[6] + priv[0].u16[7];

                d = rte_pktmbuf_mtod(mbuf[2], void *);
                rte_memcpy(d, nh[priv[1].u16[0]].rewrite_data,
                           nh[priv[1].u16[0]].rewrite_len);
                next[2] = nh[priv[1].u16[0]].tx_node;
                ip = (struct rte_ipv4_hdr *)((uint8_t *)d +
                      sizeof(struct rte_ether_hdr));
                ip->time_to_live = priv[1].u16[1] - 1;
                ip->hdr_checksum = priv[1].u16[2] + priv[1].u16[3];

                d = rte_pktmbuf_mtod(mbuf[3], void *);
                rte_memcpy(d, nh[priv[1].u16[4]].rewrite_data,
                           nh[priv[1].u16[4]].rewrite_len);
                next[3] = nh[priv[1].u16[4]].tx_node;
                ip = (struct rte_ipv4_hdr *)((uint8_t *)d +
                      sizeof(struct rte_ether_hdr));
                ip->time_to_live = priv[1].u16[5] - 1;
                ip->hdr_checksum = priv[1].u16[6] + priv[1].u16[7];

	Leads to something like:
	
                for (i = 0, j = 0; i < BUF_PER_LOOP; i += 2, j++) {
                        priv[j].u64[0] = rte_node_mbuf_priv1(mbuf[i])->u;
                        priv[j].u32[1] += rte_cpu_to_be_16(0x0100);
                        d = rte_pktmbuf_mtod(mbuf[i], void *);
                        ip = (struct rte_ipv4_hdr *)((uint8_t *)d +
                                            sizeof(struct rte_ether_hdr));
                        ip->time_to_live = priv[j].u16[1] - 1;
                        ip->hdr_checksum = priv[j].u16[2] + priv[j].u16[3];
                        rte_memcpy(d, nh[priv[j].u16[0]].rewrite_data,
                                   nh[priv[j].u16[0]].rewrite_len);

                        next[i] = nh[priv[j].u16[0]].tx_node;

                        priv[j].u64[1] = rte_node_mbuf_priv1(mbuf[i + 1])->u;
                        priv[j].u32[3] += rte_cpu_to_be_16(0x0100);
                        d = rte_pktmbuf_mtod(mbuf[i + 1], void *);
                        ip = (struct rte_ipv4_hdr *)((uint8_t *)d +
                                        sizeof(struct rte_ether_hdr));
                        ip->time_to_live = priv[j].u16[5] - 1;
                        ip->hdr_checksum = priv[j].u16[6] + priv[j].u16[7];
                        rte_memcpy(d, nh[priv[j].u16[4]].rewrite_data,
                                   nh[priv[j].u16[4]].rewrite_len);

                        next[i + 1] = nh[priv[j].u16[4]].tx_node;
                }
	Which is kind of unreadable.

2. Not all compilers are created equal. I found that most compilers don't
     unroll the loop above even when compiled with `-funroll-all-loops`.
     I checked with the following compilers:
	GCC 9.2.0
	Clang 9.0.1
	Aarch64 GCC 7.3.0
	Aarch64 GCC 9.2.0
			

3. Performance-wise I see a lot of degradation on our platform, at least 13%.
    On IA with a Broadwell (Xeon E5-2690) and i40e, the performance remains the same w.r.t. Rx/Tx since the
    hotspot is in the Tx path of the driver, which limits the per-core capability.
    But the performance difference in cycles per node can be seen below:

	Hand unrolling:
+-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
|Node                           |calls          |objs           |realloc_count  |objs/call      |objs/sec(10E6) |cycles/call|
+-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
|ip4_lookup                     |7765918        |248509344      |1              |32.000         |27.725408      |779.0000   |
|ip4_rewrite                    |7765925        |248509568      |1              |32.000         |27.725408      |425.0000   |
|ethdev_tx-1                    |7765927        |204056223      |1              |26.000         |22.762720      |597.0000   |
|pkt_drop                       |1389170        |44453409       |1              |32.000         |4.962688       |298.0000   |
|ethdev_rx-0-0                  |63604111       |248509792      |2              |32.000         |27.725408      |982.0000   |
+-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
	
	W/o unrolling:

+-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
|Node                           |calls          |objs           |realloc_count  |objs/call      |objs/sec(10E6) |cycles/call|
+-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
|ip4_lookup                     |18864640       |603668448      |1              |32.000         |26.051328      |828.0000   |
|ip4_rewrite                    |18864646       |603668640      |1              |32.000         |26.051328      |534.0000   |
|ethdev_tx-1                    |18864648       |527874175      |1              |27.000         |22.780256      |633.0000   |
|pkt_drop                       |2368580        |75794529       |1              |32.000         |3.271072       |286.0000   |
|ethdev_rx-0-0                  |282058226      |603668864      |2              |32.000         |26.051328      |994.0000   |
+-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+

Considering the above findings we would like to continue unrolling the loops by hand.
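
For reference, a minimal sketch of the loop-based form being debated here (a fixed four-iteration loop feeding rte_lpm_lookupx4, as suggested earlier in the thread), assuming the same LPM/mbuf APIs the x86 patch uses; the checksum/TTL caching, next-node decode and speculative enqueue logic are deliberately left out, and the function name is only illustrative:

    #include <rte_byteorder.h>
    #include <rte_ether.h>
    #include <rte_ip.h>
    #include <rte_lpm.h>
    #include <rte_mbuf.h>
    #include <rte_vect.h>

    /* Sketch only: gather four destination addresses with a fixed-count
     * loop and issue one x4 LPM lookup.  Whether a given compiler unrolls
     * the inner loop well is exactly the point of contention above. */
    static inline void
    ip4_lookup_x4_sketch(struct rte_lpm *lpm, struct rte_mbuf **pkts,
                         uint32_t drop_nh, rte_xmm_t *dst)
    {
            struct rte_ipv4_hdr *ipv4_hdr;
            uint32_t dip[4];
            int i;

            for (i = 0; i < 4; i++) {       /* compile-time trip count */
                    ipv4_hdr = rte_pktmbuf_mtod_offset(pkts[i],
                                    struct rte_ipv4_hdr *,
                                    sizeof(struct rte_ether_hdr));
                    dip[i] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
            }

            /* One x4 lookup for the four destination addresses. */
            rte_lpm_lookupx4(lpm, _mm_set_epi32(dip[3], dip[2], dip[1], dip[0]),
                             dst->u32, drop_nh);
    }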

Regards,
Pavan.

>-----Original Message-----
>From: Ray Kinsella <mdr@ashroe.eu>
>Sent: Friday, March 20, 2020 2:44 PM
>To: Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>; Jerin
>Jacob Kollanukkaran <jerinj@marvell.com>; Nithin Kumar Dabilpuram
><ndabilpuram@marvell.com>
>Cc: dev@dpdk.org; thomas@monjalon.net;
>david.marchand@redhat.com; mattias.ronnblom@ericsson.com; Kiran
>Kumar Kokkilagadda <kirankumark@marvell.com>
>Subject: Re: [EXT] Re: [dpdk-dev] [PATCH v1 20/26] node: ipv4 lookup
>for x86
>
>
>
>On 19/03/2020 16:13, Pavan Nikhilesh Bhagavatula wrote:
>>
>>
>>> -----Original Message-----
>>> From: Ray Kinsella <mdr@ashroe.eu>
>>> Sent: Thursday, March 19, 2020 9:21 PM
>>> To: Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>;
>Jerin
>>> Jacob Kollanukkaran <jerinj@marvell.com>; Nithin Kumar
>Dabilpuram
>>> <ndabilpuram@marvell.com>
>>> Cc: dev@dpdk.org; thomas@monjalon.net;
>>> david.marchand@redhat.com; mattias.ronnblom@ericsson.com;
>Kiran
>>> Kumar Kokkilagadda <kirankumark@marvell.com>
>>> Subject: Re: [EXT] Re: [dpdk-dev] [PATCH v1 20/26] node: ipv4
>lookup
>>> for x86
>>>
>>>
>>>
>>> On 19/03/2020 14:22, Pavan Nikhilesh Bhagavatula wrote:
>>>>> On 18/03/2020 21:35, jerinj@marvell.com wrote:
>>>>>> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>>>>>>
>>>>>> Add IPv4 lookup process function for ip4_lookup
>>>>>> rte_node. This node performs LPM lookup using x86_64
>>>>>> vector supported RTE_LPM API on every packet received
>>>>>> and forwards it to a next node that is identified by
>>>>>> lookup result.
>>>>>>
>>>>>> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
>>>>>> Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>
>>>>>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
>>>>>> ---
>>>>>>  lib/librte_node/ip4_lookup.c | 245
>>>>> +++++++++++++++++++++++++++++++++++
>>>>>>  1 file changed, 245 insertions(+)
>>>>>>
>>>>>> diff --git a/lib/librte_node/ip4_lookup.c
>>>>> b/lib/librte_node/ip4_lookup.c
>>>>>> index d7fcd1158..c003e9c91 100644
>>>>>> --- a/lib/librte_node/ip4_lookup.c
>>>>>> +++ b/lib/librte_node/ip4_lookup.c
>>>>>> @@ -264,6 +264,251 @@ ip4_lookup_node_process(struct
>>> rte_graph
>>>>> *graph, struct rte_node *node,
>>>>>>  	return nb_objs;
>>>>>>  }
>>>>>>
>>>>>> +#elif defined(RTE_ARCH_X86)
>>>>>> +
>>>>>> +/* X86 SSE */
>>>>>> +static uint16_t
>>>>>> +ip4_lookup_node_process(struct rte_graph *graph, struct
>>> rte_node
>>>>> *node,
>>>>>> +			void **objs, uint16_t nb_objs)
>>>>>> +{
>>>>>> +	struct rte_mbuf *mbuf0, *mbuf1, *mbuf2, *mbuf3,
>**pkts;
>>>>>> +	rte_edge_t next0, next1, next2, next3, next_index;
>>>>>> +	struct rte_ipv4_hdr *ipv4_hdr;
>>>>>> +	struct rte_ether_hdr *eth_hdr;
>>>>>> +	uint32_t ip0, ip1, ip2, ip3;
>>>>>> +	void **to_next, **from;
>>>>>> +	uint16_t last_spec = 0;
>>>>>> +	uint16_t n_left_from;
>>>>>> +	struct rte_lpm *lpm;
>>>>>> +	uint16_t held = 0;
>>>>>> +	uint32_t drop_nh;
>>>>>> +	rte_xmm_t dst;
>>>>>> +	__m128i dip; /* SSE register */
>>>>>> +	int rc, i;
>>>>>> +
>>>>>> +	/* Speculative next */
>>>>>> +	next_index =
>RTE_NODE_IP4_LOOKUP_NEXT_REWRITE;
>>>>>> +	/* Drop node */
>>>>>> +	drop_nh =
>>>>> ((uint32_t)RTE_NODE_IP4_LOOKUP_NEXT_PKT_DROP) << 16;
>>>>>> +
>>>>>> +	/* Get socket specific LPM from ctx */
>>>>>> +	lpm = *((struct rte_lpm **)node->ctx);
>>>>>> +
>>>>>> +	pkts = (struct rte_mbuf **)objs;
>>>>>> +	from = objs;
>>>>>> +	n_left_from = nb_objs;
>>>>>
>>>>> I doubt this initial prefetch of the first 4 packets has any benefit.
>>>>
>>>> Ack will remove in v2 for x86.
>>>>
>>>>>
>>>>>> +	if (n_left_from >= 4) {
>>>>>> +		for (i = 0; i < 4; i++) {
>>>>>> +
>	rte_prefetch0(rte_pktmbuf_mtod(pkts[i],
>>>>>> +						       struct
>rte_ether_hdr
>>>>> *) +
>>>>>> +				      1);
>>>>>> +		}
>>>>>> +	}
>>>>>> +
>>>>>> +	/* Get stream for the speculated next node */
>>>>>> +	to_next = rte_node_next_stream_get(graph, node,
>>>>> next_index, nb_objs);
>>>>>
>>>>> Suggest you don't reuse the hand-unrolling optimization from
>FD.io
>>>>> VPP.
>>>>> I have never found any performance benefit from them, and they
>>>>> make the code unnecessarily verbose.
>>>>>
>>>>
>>>> How would be take the benefit of rte_lpm_lookupx4 without
>>> unrolling the loop?.
>>>> Also, in future if we are using rte_rib and fib with a CPU supporting
>>> wider SIMD we might
>>>> need to unroll them further (AVX256 AND 512 currently
>>> rte_lpm_lookup uses only 128bit
>>>> since it is only uses SSE extension).
>>>
>>> Let the compiler do it for you, but using a constant vector length.
>>> for (int i=0; i < 4; ++i) { ... }
>>>
>>
>> Ok, I think I misunderstood the previous comment.
>> It was only for the prefetches in the loop right?
>
>
>no, it was for all the needless repetition.
>hand-unrolling loops serve no purpose but to add verbosity.
>
>>
>>>>
>>>>>
>>>>>> +	while (n_left_from >= 4) {
>>>>>> +		/* Prefetch next-next mbufs */
>>>>>> +		if (likely(n_left_from >= 11)) {
>>>>>> +			rte_prefetch0(pkts[8]);
>>>>>> +			rte_prefetch0(pkts[9]);
>>>>>> +			rte_prefetch0(pkts[10]);
>>>>>> +			rte_prefetch0(pkts[11]);
>>>>>> +		}
>>>>>> +
>>>>>> +		/* Prefetch next mbuf data */
>>>>>> +		if (likely(n_left_from >= 7)) {
>>>>>> +
>	rte_prefetch0(rte_pktmbuf_mtod(pkts[4],
>>>>>> +						       struct
>rte_ether_hdr
>>>>> *) +
>>>>>> +				      1);
>>>>>> +
>	rte_prefetch0(rte_pktmbuf_mtod(pkts[5],
>>>>>> +						       struct
>rte_ether_hdr
>>>>> *) +
>>>>>> +				      1);
>>>>>> +
>	rte_prefetch0(rte_pktmbuf_mtod(pkts[6],
>>>>>> +						       struct
>rte_ether_hdr
>>>>> *) +
>>>>>> +				      1);
>>>>>> +
>	rte_prefetch0(rte_pktmbuf_mtod(pkts[7],
>>>>>> +						       struct
>rte_ether_hdr
>>>>> *) +
>>>>>> +				      1);
>>>>>> +		}
>>>>>> +
>>>>>> +		mbuf0 = pkts[0];
>>>>>> +		mbuf1 = pkts[1];
>>>>>> +		mbuf2 = pkts[2];
>>>>>> +		mbuf3 = pkts[3];
>>>>>> +
>>>>>> +		pkts += 4;
>>>>>> +		n_left_from -= 4;
>>>>>> +
>>>>>> +		/* Extract DIP of mbuf0 */
>>>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf0, struct
>>>>> rte_ether_hdr *);
>>>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>>>> +		ip0 = ipv4_hdr->dst_addr;
>>>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>>>> +		rte_node_mbuf_priv1(mbuf0)->cksum =
>ipv4_hdr-
>>>>>> hdr_checksum;
>>>>>> +		rte_node_mbuf_priv1(mbuf0)->ttl = ipv4_hdr-
>>>>>> time_to_live;
>>>>>> +
>>>>>> +		/* Extract DIP of mbuf1 */
>>>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf1, struct
>>>>> rte_ether_hdr *);
>>>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>>>> +		ip1 = ipv4_hdr->dst_addr;
>>>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>>>> +		rte_node_mbuf_priv1(mbuf1)->cksum =
>ipv4_hdr-
>>>>>> hdr_checksum;
>>>>>> +		rte_node_mbuf_priv1(mbuf1)->ttl = ipv4_hdr-
>>>>>> time_to_live;
>>>>>> +
>>>>>> +		/* Extract DIP of mbuf2 */
>>>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf2, struct
>>>>> rte_ether_hdr *);
>>>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>>>> +		ip2 = ipv4_hdr->dst_addr;
>>>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>>>> +		rte_node_mbuf_priv1(mbuf2)->cksum =
>ipv4_hdr-
>>>>>> hdr_checksum;
>>>>>> +		rte_node_mbuf_priv1(mbuf2)->ttl = ipv4_hdr-
>>>>>> time_to_live;
>>>>>> +
>>>>>> +		/* Extract DIP of mbuf3 */
>>>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf3, struct
>>>>> rte_ether_hdr *);
>>>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>>>> +		ip3 = ipv4_hdr->dst_addr;
>>>>>> +
>>>>>> +		/* Prepare for lookup x4 */
>>>>>> +		dip = _mm_set_epi32(ip3, ip2, ip1, ip0);
>>>>>> +
>>>>>> +		/* Byte swap 4 IPV4 addresses. */
>>>>>> +		const __m128i bswap_mask = _mm_set_epi8(
>>>>>> +			12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1,
>2, 3);
>>>>>> +		dip = _mm_shuffle_epi8(dip, bswap_mask);
>>>>>> +
>>>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>>>> +		rte_node_mbuf_priv1(mbuf3)->cksum =
>ipv4_hdr-
>>>>>> hdr_checksum;
>>>>>> +		rte_node_mbuf_priv1(mbuf3)->ttl = ipv4_hdr-
>>>>>> time_to_live;
>>>>>> +
>>>>>> +		/* Perform LPM lookup to get NH and next
>node */
>>>>>> +		rte_lpm_lookupx4(lpm, dip, dst.u32, drop_nh);
>>>>>> +
>>>>>> +		/* Extract next node id and NH */
>>>>>> +		rte_node_mbuf_priv1(mbuf0)->nh = dst.u32[0]
>&
>>>>> 0xFFFF;
>>>>>> +		next0 = (dst.u32[0] >> 16);
>>>>>> +
>>>>>> +		rte_node_mbuf_priv1(mbuf1)->nh = dst.u32[1]
>&
>>>>> 0xFFFF;
>>>>>> +		next1 = (dst.u32[1] >> 16);
>>>>>> +
>>>>>> +		rte_node_mbuf_priv1(mbuf2)->nh = dst.u32[2]
>&
>>>>> 0xFFFF;
>>>>>> +		next2 = (dst.u32[2] >> 16);
>>>>>> +
>>>>>> +		rte_node_mbuf_priv1(mbuf3)->nh = dst.u32[3]
>&
>>>>> 0xFFFF;
>>>>>> +		next3 = (dst.u32[3] >> 16);
>>>>>> +
>>>>>> +		/* Enqueue four to next node */
>>>>>> +		rte_edge_t fix_spec =
>>>>>> +			(next_index ^ next0) | (next_index ^
>next1) |
>>>>>> +			(next_index ^ next2) | (next_index ^
>next3);
>>>>>> +
>>>>>> +		if (unlikely(fix_spec)) {
>>>>>> +			/* Copy things successfully speculated
>till now
>>>>> */
>>>>>> +			rte_memcpy(to_next, from, last_spec *
>>>>> sizeof(from[0]));
>>>>>> +			from += last_spec;
>>>>>> +			to_next += last_spec;
>>>>>> +			held += last_spec;
>>>>>> +			last_spec = 0;
>>>>>> +
>>>>>> +			/* Next0 */
>>>>>> +			if (next_index == next0) {
>>>>>> +				to_next[0] = from[0];
>>>>>> +				to_next++;
>>>>>> +				held++;
>>>>>> +			} else {
>>>>>> +				rte_node_enqueue_x1(graph,
>node,
>>>>> next0,
>>>>>> +						    from[0]);
>>>>>> +			}
>>>>>> +
>>>>>> +			/* Next1 */
>>>>>> +			if (next_index == next1) {
>>>>>> +				to_next[0] = from[1];
>>>>>> +				to_next++;
>>>>>> +				held++;
>>>>>> +			} else {
>>>>>> +				rte_node_enqueue_x1(graph,
>node,
>>>>> next1,
>>>>>> +						    from[1]);
>>>>>> +			}
>>>>>> +
>>>>>> +			/* Next2 */
>>>>>> +			if (next_index == next2) {
>>>>>> +				to_next[0] = from[2];
>>>>>> +				to_next++;
>>>>>> +				held++;
>>>>>> +			} else {
>>>>>> +				rte_node_enqueue_x1(graph,
>node,
>>>>> next2,
>>>>>> +						    from[2]);
>>>>>> +			}
>>>>>> +
>>>>>> +			/* Next3 */
>>>>>> +			if (next_index == next3) {
>>>>>> +				to_next[0] = from[3];
>>>>>> +				to_next++;
>>>>>> +				held++;
>>>>>> +			} else {
>>>>>> +				rte_node_enqueue_x1(graph,
>node,
>>>>> next3,
>>>>>> +						    from[3]);
>>>>>> +			}
>>>>>> +
>>>>>> +			from += 4;
>>>>>> +
>>>>>> +		} else {
>>>>>> +			last_spec += 4;
>>>>>> +		}
>>>>>> +	}
>>>>>> +
>>>>>> +	while (n_left_from > 0) {
>>>>>> +		uint32_t next_hop;
>>>>>> +
>>>>>> +		mbuf0 = pkts[0];
>>>>>> +
>>>>>> +		pkts += 1;
>>>>>> +		n_left_from -= 1;
>>>>>> +
>>>>>> +		/* Extract DIP of mbuf0 */
>>>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf0, struct
>>>>> rte_ether_hdr *);
>>>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>>>> +		rte_node_mbuf_priv1(mbuf0)->cksum =
>ipv4_hdr-
>>>>>> hdr_checksum;
>>>>>> +		rte_node_mbuf_priv1(mbuf0)->ttl = ipv4_hdr-
>>>>>> time_to_live;
>>>>>> +
>>>>>> +		rc = rte_lpm_lookup(lpm,
>rte_be_to_cpu_32(ipv4_hdr-
>>>>>> dst_addr),
>>>>>> +				    &next_hop);
>>>>>> +		next_hop = (rc == 0) ? next_hop : drop_nh;
>>>>>> +
>>>>>> +		rte_node_mbuf_priv1(mbuf0)->nh = next_hop
>&
>>>>> 0xFFFF;
>>>>>> +		next0 = (next_hop >> 16);
>>>>>> +
>>>>>> +		if (unlikely(next_index ^ next0)) {
>>>>>> +			/* Copy things successfully speculated
>till now
>>>>> */
>>>>>> +			rte_memcpy(to_next, from, last_spec *
>>>>> sizeof(from[0]));
>>>>>> +			from += last_spec;
>>>>>> +			to_next += last_spec;
>>>>>> +			held += last_spec;
>>>>>> +			last_spec = 0;
>>>>>> +
>>>>>> +			rte_node_enqueue_x1(graph, node,
>next0,
>>>>> from[0]);
>>>>>> +			from += 1;
>>>>>> +		} else {
>>>>>> +			last_spec += 1;
>>>>>> +		}
>>>>>> +	}
>>>>>> +
>>>>>> +	/* !!! Home run !!! */
>>>>>> +	if (likely(last_spec == nb_objs)) {
>>>>>> +		rte_node_next_stream_move(graph, node,
>>>>> next_index);
>>>>>> +		return nb_objs;
>>>>>> +	}
>>>>>> +
>>>>>> +	held += last_spec;
>>>>>> +	/* Copy things successfully speculated till now */
>>>>>> +	rte_memcpy(to_next, from, last_spec *
>sizeof(from[0]));
>>>>>> +	rte_node_next_stream_put(graph, node, next_index,
>held);
>>>>>> +
>>>>>> +	return nb_objs;
>>>>>> +}
>>>>>> +
>>>>>>  #else
>>>>>>
>>>>>>  static uint16_t
>>>>>>
  
Ray Kinsella March 24, 2020, 2:38 p.m. UTC | #7
On 24/03/2020 09:40, Pavan Nikhilesh Bhagavatula wrote:
> Hi Ray, 
> 
> I have tried to avoid hand unrolling loops and found the following observations.
> 
> 1. Although it decreases LOC it also takes away readability too.
> 	Example: 
> 	Avoiding unrolled code below
[SNIP] 
> 	Which is kind of unreadable.

I am confused - isn't it exactly the same code?
You still haven't completely unrolled the loop either?

I don't know how one is readable and the other is not. 

> 
> 2. Not all compilers are made equal. I found that most of the compilers don’t 
>      Unroll the loop above even when compiled with `-funroll-all-loops`.
>      I have checked with following compilers:
> 	GCC 9.2.0
> 	Clang 9.0.1
> 	Aarch64 GCC 7.3.0
> 	Aarch64 GCC 9.2.0

Compilers have been unrolling fixed-length loops for a long time - this isn't new technology.

If the compiler isn't unrolling, you are doing something that makes it think it is a bad idea.
Hand-unrolling the loop isn't the solution; understanding what the compiler is doing is a better idea.

In front of your for loop, insert the following to indicate to the compiler what you want to do. 
#pragma unroll BUF_PER_LOOP 

With clang you can ask it why it is not unrolling the loop with the following switches. 
(output is verbose, but the reason is in there). 

-Rpass=loop-unroll -Rpass-missed=loop-unroll
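
To make the placement concrete, a hypothetical sketch (not code from the patch, with a literal count standing in for BUF_PER_LOOP) of how such a pragma sits directly in front of a fixed-count loop; building with the -Rpass switches above then reports whether and why the loop was unrolled. The bare "#pragma unroll" spelling is a clang/icc extension; GCC's handling of it comes up further down the thread.

    #include <rte_ether.h>
    #include <rte_mbuf.h>
    #include <rte_prefetch.h>

    static inline void
    prefetch_x4_sketch(struct rte_mbuf **pkts)
    {
            int i;

            /* The pragma goes immediately in front of the fixed-count loop. */
    #pragma unroll 4
            for (i = 0; i < 4; i++)
                    rte_prefetch0(rte_pktmbuf_mtod(pkts[i],
                                                   struct rte_ether_hdr *) + 1);
    }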
 			
> 
> 3. Performance wise I see a lot of degradation on our platform at least 13%.

Is the loop being unrolled?

>     On IA with a Broadwell(Xeon E5-2690) and i40e the performance remain same w.r.t Rx/Tx since the 
>     hotspot is in the Tx path of the driver which limits the per core capability.
>     But the performance difference in number of cycles per node can be seen below:
> 
> 	Hand unrolling:
> +-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
> |Node                           |calls          |objs           |realloc_count  |objs/call      |objs/sec(10E6) |cycles/call|
> +-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
> |ip4_lookup                     |7765918        |248509344      |1              |32.000         |27.725408      |779.0000   |
> |ip4_rewrite                    |7765925        |248509568      |1              |32.000         |27.725408      |425.0000   |
> |ethdev_tx-1                    |7765927        |204056223      |1              |26.000         |22.762720      |597.0000   |
> |pkt_drop                       |1389170        |44453409       |1              |32.000         |4.962688       |298.0000   |
> |ethdev_rx-0-0                  |63604111       |248509792      |2              |32.000         |27.725408      |982.0000   |
> +-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
> 	
> 	W/o unrolling:
> 
> +-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
> |Node                           |calls          |objs           |realloc_count  |objs/call      |objs/sec(10E6) |cycles/call|
> +-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
> |ip4_lookup                     |18864640       |603668448      |1              |32.000         |26.051328      |828.0000   |
> |ip4_rewrite                    |18864646       |603668640      |1              |32.000         |26.051328      |534.0000   |
> |ethdev_tx-1                    |18864648       |527874175      |1              |27.000         |22.780256      |633.0000   |
> |pkt_drop                       |2368580        |75794529       |1              |32.000         |3.271072       |286.0000   |
> |ethdev_rx-0-0                  |282058226      |603668864      |2              |32.000         |26.051328      |994.0000   |
> +-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
> 
> Considering the above findings we would like to continue unrolling the loops by hand.
> 
> Regards,
> Pavan.
> 
>> -----Original Message-----
>> From: Ray Kinsella <mdr@ashroe.eu>
>> Sent: Friday, March 20, 2020 2:44 PM
>> To: Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>; Jerin
>> Jacob Kollanukkaran <jerinj@marvell.com>; Nithin Kumar Dabilpuram
>> <ndabilpuram@marvell.com>
>> Cc: dev@dpdk.org; thomas@monjalon.net;
>> david.marchand@redhat.com; mattias.ronnblom@ericsson.com; Kiran
>> Kumar Kokkilagadda <kirankumark@marvell.com>
>> Subject: Re: [EXT] Re: [dpdk-dev] [PATCH v1 20/26] node: ipv4 lookup
>> for x86
>>
>>
>>
>> On 19/03/2020 16:13, Pavan Nikhilesh Bhagavatula wrote:
>>>
>>>
>>>> -----Original Message-----
>>>> From: Ray Kinsella <mdr@ashroe.eu>
>>>> Sent: Thursday, March 19, 2020 9:21 PM
>>>> To: Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>;
>> Jerin
>>>> Jacob Kollanukkaran <jerinj@marvell.com>; Nithin Kumar
>> Dabilpuram
>>>> <ndabilpuram@marvell.com>
>>>> Cc: dev@dpdk.org; thomas@monjalon.net;
>>>> david.marchand@redhat.com; mattias.ronnblom@ericsson.com;
>> Kiran
>>>> Kumar Kokkilagadda <kirankumark@marvell.com>
>>>> Subject: Re: [EXT] Re: [dpdk-dev] [PATCH v1 20/26] node: ipv4
>> lookup
>>>> for x86
>>>>
>>>>
>>>>
>>>> On 19/03/2020 14:22, Pavan Nikhilesh Bhagavatula wrote:
>>>>>> On 18/03/2020 21:35, jerinj@marvell.com wrote:
>>>>>>> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>>>>>>>
>>>>>>> Add IPv4 lookup process function for ip4_lookup
>>>>>>> rte_node. This node performs LPM lookup using x86_64
>>>>>>> vector supported RTE_LPM API on every packet received
>>>>>>> and forwards it to a next node that is identified by
>>>>>>> lookup result.
>>>>>>>
>>>>>>> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
>>>>>>> Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>
>>>>>>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
>>>>>>> ---
>>>>>>>  lib/librte_node/ip4_lookup.c | 245
>>>>>> +++++++++++++++++++++++++++++++++++
>>>>>>>  1 file changed, 245 insertions(+)
>>>>>>>
>>>>>>> diff --git a/lib/librte_node/ip4_lookup.c
>>>>>> b/lib/librte_node/ip4_lookup.c
>>>>>>> index d7fcd1158..c003e9c91 100644
>>>>>>> --- a/lib/librte_node/ip4_lookup.c
>>>>>>> +++ b/lib/librte_node/ip4_lookup.c
>>>>>>> @@ -264,6 +264,251 @@ ip4_lookup_node_process(struct
>>>> rte_graph
>>>>>> *graph, struct rte_node *node,
>>>>>>>  	return nb_objs;
>>>>>>>  }
>>>>>>>
>>>>>>> +#elif defined(RTE_ARCH_X86)
>>>>>>> +
>>>>>>> +/* X86 SSE */
>>>>>>> +static uint16_t
>>>>>>> +ip4_lookup_node_process(struct rte_graph *graph, struct
>>>> rte_node
>>>>>> *node,
>>>>>>> +			void **objs, uint16_t nb_objs)
>>>>>>> +{
>>>>>>> +	struct rte_mbuf *mbuf0, *mbuf1, *mbuf2, *mbuf3,
>> **pkts;
>>>>>>> +	rte_edge_t next0, next1, next2, next3, next_index;
>>>>>>> +	struct rte_ipv4_hdr *ipv4_hdr;
>>>>>>> +	struct rte_ether_hdr *eth_hdr;
>>>>>>> +	uint32_t ip0, ip1, ip2, ip3;
>>>>>>> +	void **to_next, **from;
>>>>>>> +	uint16_t last_spec = 0;
>>>>>>> +	uint16_t n_left_from;
>>>>>>> +	struct rte_lpm *lpm;
>>>>>>> +	uint16_t held = 0;
>>>>>>> +	uint32_t drop_nh;
>>>>>>> +	rte_xmm_t dst;
>>>>>>> +	__m128i dip; /* SSE register */
>>>>>>> +	int rc, i;
>>>>>>> +
>>>>>>> +	/* Speculative next */
>>>>>>> +	next_index =
>> RTE_NODE_IP4_LOOKUP_NEXT_REWRITE;
>>>>>>> +	/* Drop node */
>>>>>>> +	drop_nh =
>>>>>> ((uint32_t)RTE_NODE_IP4_LOOKUP_NEXT_PKT_DROP) << 16;
>>>>>>> +
>>>>>>> +	/* Get socket specific LPM from ctx */
>>>>>>> +	lpm = *((struct rte_lpm **)node->ctx);
>>>>>>> +
>>>>>>> +	pkts = (struct rte_mbuf **)objs;
>>>>>>> +	from = objs;
>>>>>>> +	n_left_from = nb_objs;
>>>>>>
>>>>>> I doubt this initial prefetch of the first 4 packets has any benefit.
>>>>>
>>>>> Ack will remove in v2 for x86.
>>>>>
>>>>>>
>>>>>>> +	if (n_left_from >= 4) {
>>>>>>> +		for (i = 0; i < 4; i++) {
>>>>>>> +
>> 	rte_prefetch0(rte_pktmbuf_mtod(pkts[i],
>>>>>>> +						       struct
>> rte_ether_hdr
>>>>>> *) +
>>>>>>> +				      1);
>>>>>>> +		}
>>>>>>> +	}
>>>>>>> +
>>>>>>> +	/* Get stream for the speculated next node */
>>>>>>> +	to_next = rte_node_next_stream_get(graph, node,
>>>>>> next_index, nb_objs);
>>>>>>
>>>>>> Suggest you don't reuse the hand-unrolling optimization from
>> FD.io
>>>>>> VPP.
>>>>>> I have never found any performance benefit from them, and they
>>>>>> make the code unnecessarily verbose.
>>>>>>
>>>>>
>>>>> How would be take the benefit of rte_lpm_lookupx4 without
>>>> unrolling the loop?.
>>>>> Also, in future if we are using rte_rib and fib with a CPU supporting
>>>> wider SIMD we might
>>>>> need to unroll them further (AVX256 AND 512 currently
>>>> rte_lpm_lookup uses only 128bit
>>>>> since it is only uses SSE extension).
>>>>
>>>> Let the compiler do it for you, but using a constant vector length.
>>>> for (int i=0; i < 4; ++i) { ... }
>>>>
>>>
>>> Ok, I think I misunderstood the previous comment.
>>> It was only for the prefetches in the loop right?
>>
>>
>> no, it was for all the needless repetition.
>> hand-unrolling loops serve no purpose but to add verbosity.
>>
>>>
>>>>>
>>>>>>
>>>>>>> +	while (n_left_from >= 4) {
>>>>>>> +		/* Prefetch next-next mbufs */
>>>>>>> +		if (likely(n_left_from >= 11)) {
>>>>>>> +			rte_prefetch0(pkts[8]);
>>>>>>> +			rte_prefetch0(pkts[9]);
>>>>>>> +			rte_prefetch0(pkts[10]);
>>>>>>> +			rte_prefetch0(pkts[11]);
>>>>>>> +		}
>>>>>>> +
>>>>>>> +		/* Prefetch next mbuf data */
>>>>>>> +		if (likely(n_left_from >= 7)) {
>>>>>>> +
>> 	rte_prefetch0(rte_pktmbuf_mtod(pkts[4],
>>>>>>> +						       struct
>> rte_ether_hdr
>>>>>> *) +
>>>>>>> +				      1);
>>>>>>> +
>> 	rte_prefetch0(rte_pktmbuf_mtod(pkts[5],
>>>>>>> +						       struct
>> rte_ether_hdr
>>>>>> *) +
>>>>>>> +				      1);
>>>>>>> +
>> 	rte_prefetch0(rte_pktmbuf_mtod(pkts[6],
>>>>>>> +						       struct
>> rte_ether_hdr
>>>>>> *) +
>>>>>>> +				      1);
>>>>>>> +
>> 	rte_prefetch0(rte_pktmbuf_mtod(pkts[7],
>>>>>>> +						       struct
>> rte_ether_hdr
>>>>>> *) +
>>>>>>> +				      1);
>>>>>>> +		}
>>>>>>> +
>>>>>>> +		mbuf0 = pkts[0];
>>>>>>> +		mbuf1 = pkts[1];
>>>>>>> +		mbuf2 = pkts[2];
>>>>>>> +		mbuf3 = pkts[3];
>>>>>>> +
>>>>>>> +		pkts += 4;
>>>>>>> +		n_left_from -= 4;
>>>>>>> +
>>>>>>> +		/* Extract DIP of mbuf0 */
>>>>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf0, struct
>>>>>> rte_ether_hdr *);
>>>>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>>>>> +		ip0 = ipv4_hdr->dst_addr;
>>>>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>>>>> +		rte_node_mbuf_priv1(mbuf0)->cksum =
>> ipv4_hdr-
>>>>>>> hdr_checksum;
>>>>>>> +		rte_node_mbuf_priv1(mbuf0)->ttl = ipv4_hdr-
>>>>>>> time_to_live;
>>>>>>> +
>>>>>>> +		/* Extract DIP of mbuf1 */
>>>>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf1, struct
>>>>>> rte_ether_hdr *);
>>>>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>>>>> +		ip1 = ipv4_hdr->dst_addr;
>>>>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>>>>> +		rte_node_mbuf_priv1(mbuf1)->cksum =
>> ipv4_hdr-
>>>>>>> hdr_checksum;
>>>>>>> +		rte_node_mbuf_priv1(mbuf1)->ttl = ipv4_hdr-
>>>>>>> time_to_live;
>>>>>>> +
>>>>>>> +		/* Extract DIP of mbuf2 */
>>>>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf2, struct
>>>>>> rte_ether_hdr *);
>>>>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>>>>> +		ip2 = ipv4_hdr->dst_addr;
>>>>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>>>>> +		rte_node_mbuf_priv1(mbuf2)->cksum =
>> ipv4_hdr-
>>>>>>> hdr_checksum;
>>>>>>> +		rte_node_mbuf_priv1(mbuf2)->ttl = ipv4_hdr-
>>>>>>> time_to_live;
>>>>>>> +
>>>>>>> +		/* Extract DIP of mbuf3 */
>>>>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf3, struct
>>>>>> rte_ether_hdr *);
>>>>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>>>>> +		ip3 = ipv4_hdr->dst_addr;
>>>>>>> +
>>>>>>> +		/* Prepare for lookup x4 */
>>>>>>> +		dip = _mm_set_epi32(ip3, ip2, ip1, ip0);
>>>>>>> +
>>>>>>> +		/* Byte swap 4 IPV4 addresses. */
>>>>>>> +		const __m128i bswap_mask = _mm_set_epi8(
>>>>>>> +			12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1,
>> 2, 3);
>>>>>>> +		dip = _mm_shuffle_epi8(dip, bswap_mask);
>>>>>>> +
>>>>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>>>>> +		rte_node_mbuf_priv1(mbuf3)->cksum =
>> ipv4_hdr-
>>>>>>> hdr_checksum;
>>>>>>> +		rte_node_mbuf_priv1(mbuf3)->ttl = ipv4_hdr-
>>>>>>> time_to_live;
>>>>>>> +
>>>>>>> +		/* Perform LPM lookup to get NH and next
>> node */
>>>>>>> +		rte_lpm_lookupx4(lpm, dip, dst.u32, drop_nh);
>>>>>>> +
>>>>>>> +		/* Extract next node id and NH */
>>>>>>> +		rte_node_mbuf_priv1(mbuf0)->nh = dst.u32[0]
>> &
>>>>>> 0xFFFF;
>>>>>>> +		next0 = (dst.u32[0] >> 16);
>>>>>>> +
>>>>>>> +		rte_node_mbuf_priv1(mbuf1)->nh = dst.u32[1]
>> &
>>>>>> 0xFFFF;
>>>>>>> +		next1 = (dst.u32[1] >> 16);
>>>>>>> +
>>>>>>> +		rte_node_mbuf_priv1(mbuf2)->nh = dst.u32[2]
>> &
>>>>>> 0xFFFF;
>>>>>>> +		next2 = (dst.u32[2] >> 16);
>>>>>>> +
>>>>>>> +		rte_node_mbuf_priv1(mbuf3)->nh = dst.u32[3]
>> &
>>>>>> 0xFFFF;
>>>>>>> +		next3 = (dst.u32[3] >> 16);
>>>>>>> +
>>>>>>> +		/* Enqueue four to next node */
>>>>>>> +		rte_edge_t fix_spec =
>>>>>>> +			(next_index ^ next0) | (next_index ^
>> next1) |
>>>>>>> +			(next_index ^ next2) | (next_index ^
>> next3);
>>>>>>> +
>>>>>>> +		if (unlikely(fix_spec)) {
>>>>>>> +			/* Copy things successfully speculated
>> till now
>>>>>> */
>>>>>>> +			rte_memcpy(to_next, from, last_spec *
>>>>>> sizeof(from[0]));
>>>>>>> +			from += last_spec;
>>>>>>> +			to_next += last_spec;
>>>>>>> +			held += last_spec;
>>>>>>> +			last_spec = 0;
>>>>>>> +
>>>>>>> +			/* Next0 */
>>>>>>> +			if (next_index == next0) {
>>>>>>> +				to_next[0] = from[0];
>>>>>>> +				to_next++;
>>>>>>> +				held++;
>>>>>>> +			} else {
>>>>>>> +				rte_node_enqueue_x1(graph,
>> node,
>>>>>> next0,
>>>>>>> +						    from[0]);
>>>>>>> +			}
>>>>>>> +
>>>>>>> +			/* Next1 */
>>>>>>> +			if (next_index == next1) {
>>>>>>> +				to_next[0] = from[1];
>>>>>>> +				to_next++;
>>>>>>> +				held++;
>>>>>>> +			} else {
>>>>>>> +				rte_node_enqueue_x1(graph,
>> node,
>>>>>> next1,
>>>>>>> +						    from[1]);
>>>>>>> +			}
>>>>>>> +
>>>>>>> +			/* Next2 */
>>>>>>> +			if (next_index == next2) {
>>>>>>> +				to_next[0] = from[2];
>>>>>>> +				to_next++;
>>>>>>> +				held++;
>>>>>>> +			} else {
>>>>>>> +				rte_node_enqueue_x1(graph,
>> node,
>>>>>> next2,
>>>>>>> +						    from[2]);
>>>>>>> +			}
>>>>>>> +
>>>>>>> +			/* Next3 */
>>>>>>> +			if (next_index == next3) {
>>>>>>> +				to_next[0] = from[3];
>>>>>>> +				to_next++;
>>>>>>> +				held++;
>>>>>>> +			} else {
>>>>>>> +				rte_node_enqueue_x1(graph,
>> node,
>>>>>> next3,
>>>>>>> +						    from[3]);
>>>>>>> +			}
>>>>>>> +
>>>>>>> +			from += 4;
>>>>>>> +
>>>>>>> +		} else {
>>>>>>> +			last_spec += 4;
>>>>>>> +		}
>>>>>>> +	}
>>>>>>> +
>>>>>>> +	while (n_left_from > 0) {
>>>>>>> +		uint32_t next_hop;
>>>>>>> +
>>>>>>> +		mbuf0 = pkts[0];
>>>>>>> +
>>>>>>> +		pkts += 1;
>>>>>>> +		n_left_from -= 1;
>>>>>>> +
>>>>>>> +		/* Extract DIP of mbuf0 */
>>>>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf0, struct
>>>>>> rte_ether_hdr *);
>>>>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>>>>> +		rte_node_mbuf_priv1(mbuf0)->cksum =
>> ipv4_hdr-
>>>>>>> hdr_checksum;
>>>>>>> +		rte_node_mbuf_priv1(mbuf0)->ttl = ipv4_hdr-
>>>>>>> time_to_live;
>>>>>>> +
>>>>>>> +		rc = rte_lpm_lookup(lpm,
>> rte_be_to_cpu_32(ipv4_hdr-
>>>>>>> dst_addr),
>>>>>>> +				    &next_hop);
>>>>>>> +		next_hop = (rc == 0) ? next_hop : drop_nh;
>>>>>>> +
>>>>>>> +		rte_node_mbuf_priv1(mbuf0)->nh = next_hop
>> &
>>>>>> 0xFFFF;
>>>>>>> +		next0 = (next_hop >> 16);
>>>>>>> +
>>>>>>> +		if (unlikely(next_index ^ next0)) {
>>>>>>> +			/* Copy things successfully speculated
>> till now
>>>>>> */
>>>>>>> +			rte_memcpy(to_next, from, last_spec *
>>>>>> sizeof(from[0]));
>>>>>>> +			from += last_spec;
>>>>>>> +			to_next += last_spec;
>>>>>>> +			held += last_spec;
>>>>>>> +			last_spec = 0;
>>>>>>> +
>>>>>>> +			rte_node_enqueue_x1(graph, node,
>> next0,
>>>>>> from[0]);
>>>>>>> +			from += 1;
>>>>>>> +		} else {
>>>>>>> +			last_spec += 1;
>>>>>>> +		}
>>>>>>> +	}
>>>>>>> +
>>>>>>> +	/* !!! Home run !!! */
>>>>>>> +	if (likely(last_spec == nb_objs)) {
>>>>>>> +		rte_node_next_stream_move(graph, node,
>>>>>> next_index);
>>>>>>> +		return nb_objs;
>>>>>>> +	}
>>>>>>> +
>>>>>>> +	held += last_spec;
>>>>>>> +	/* Copy things successfully speculated till now */
>>>>>>> +	rte_memcpy(to_next, from, last_spec *
>> sizeof(from[0]));
>>>>>>> +	rte_node_next_stream_put(graph, node, next_index,
>> held);
>>>>>>> +
>>>>>>> +	return nb_objs;
>>>>>>> +}
>>>>>>> +
>>>>>>>  #else
>>>>>>>
>>>>>>>  static uint16_t
>>>>>>>
  
Pavan Nikhilesh Bhagavatula March 26, 2020, 9:56 a.m. UTC | #8
>-----Original Message-----
>From: Ray Kinsella <mdr@ashroe.eu>
>Sent: Tuesday, March 24, 2020 8:08 PM
>To: Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>; Jerin
>Jacob Kollanukkaran <jerinj@marvell.com>; Nithin Kumar Dabilpuram
><ndabilpuram@marvell.com>
>Cc: dev@dpdk.org; thomas@monjalon.net;
>david.marchand@redhat.com; mattias.ronnblom@ericsson.com; Kiran
>Kumar Kokkilagadda <kirankumark@marvell.com>
>Subject: Re: [EXT] Re: [dpdk-dev] [PATCH v1 20/26] node: ipv4 lookup
>for x86
>
>
>
>On 24/03/2020 09:40, Pavan Nikhilesh Bhagavatula wrote:
>> Hi Ray,
>>
>> I have tried to avoid hand unrolling loops and found the following
>observations.
>>
>> 1. Although it decreases LOC it also takes away readability too.
>> 	Example:
>> 	Avoiding unrolled code below
>[SNIP]
>> 	Which is kind of unreadable.
>
>I am confused - isn't it exactly the same code?
>You still haven't completely unrolled the loop either?
>
>I don't know how one is readable and the other is not.

I guess it’s a matter of personal preference.

>
>>
>> 2. Not all compilers are made equal. I found that most of the
>compilers don’t
>>      Unroll the loop above even when compiled with `-funroll-all-loops`.
>>      I have checked with following compilers:
>> 	GCC 9.2.0
>> 	Clang 9.0.1
>> 	Aarch64 GCC 7.3.0
>> 	Aarch64 GCC 9.2.0
>
>Compilers have been unrolling fixed length loops for as long time - this
>isn't new technology.
>

In theory, I agree with your view, but even the latest compiler is not doing a decent 
job on unrolling the loop. 
We can revisit this scheme if and when the compiler is smart enough to do this, as just unrolling
the loops is not good enough. It has to do it better than hand unrolling.
For example, on arm64 GCC doesn’t merge loads/stores into load/store pairs when unrolling
(both ldr/str and ldp/stp have a latency of 3 cycles, so pairing effectively halves the load/store cost).

Even on x86 we see an extra ~100 cycles, as mentioned in [1].

>If the compiler isn't unrolling you are doing something that makes it
>think it is a bad idea.
>Hand unrolling the loop isn't the solution, understanding what the
>compiler is doing is a better idea.
>
>In front of your for loop insert, to indicate to the compiler what you
>want to do.
>#pragma unroll BUF_PER_LOOP

Can you check which versions of the compiler this pragma works on?

Most of the gcc versions that I have tried just spit out the following warning.
../lib/librte_node/ip4_rewrite.c:59: warning: ignoring #pragma unroll BUF_PER_LOOP [-Wunknown-pragmas]
   59 |  #pragma unroll BUF_PER_LOOP
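
For what it's worth, that warning is expected from GCC: the bare "#pragma unroll" spelling is a clang/icc extension, and GCC only gained a per-loop unroll pragma in GCC 8, under a different spelling. A minimal sketch of the GCC form, applied to the same illustrative prefetch loop as the earlier sketch:

    #include <rte_ether.h>
    #include <rte_mbuf.h>
    #include <rte_prefetch.h>

    static inline void
    prefetch_x4_gcc_sketch(struct rte_mbuf **pkts)
    {
            int i;

            /* GCC 8 and newer; older GCCs have no per-loop unroll pragma. */
    #pragma GCC unroll 4
            for (i = 0; i < 4; i++)
                    rte_prefetch0(rte_pktmbuf_mtod(pkts[i],
                                                   struct rte_ether_hdr *) + 1);
    }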


>
>With clang you can ask it why it is not unrolling the loop with the
>following switches.
>(output is verbose, but the reason is in there).
>
>-Rpass=loop-unroll -Rpass-missed=loop-unroll
>

../lib/librte_node/ip4_rewrite.c:57:2: remark: unrolled loop by a factor of 4 with run-time trip count [-Rpass=loop-unroll]
        for (i = 0; i < nb_objs; i++) {
        ^
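
The "run-time trip count" in that remark is the crux: with nb_objs as the bound, the unrolled copies still carry remainder handling. For comparison, a bare skeleton (per-packet work elided, names hypothetical) of the blocked shape that both the hand-unrolled patch and the loop-based rewrite share — a compile-time inner bound plus a scalar tail:

    #include <stdint.h>
    #include <rte_common.h>
    #include <rte_mbuf.h>

    #define PKTS_PER_BLOCK 4

    static inline void
    process_blocked_sketch(struct rte_mbuf **pkts, uint16_t nb_objs)
    {
            uint16_t n = nb_objs;
            int i;

            while (n >= PKTS_PER_BLOCK) {
                    for (i = 0; i < PKTS_PER_BLOCK; i++) {
                            /* per-packet work; trip count known at compile time */
                            RTE_SET_USED(pkts[i]);
                    }
                    pkts += PKTS_PER_BLOCK;
                    n -= PKTS_PER_BLOCK;
            }
            while (n > 0) {
                    /* scalar tail for the remaining packets */
                    RTE_SET_USED(pkts[0]);
                    pkts += 1;
                    n -= 1;
            }
    }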

>>
>> 3. Performance wise I see a lot of degradation on our platform at least
>13%.
>
>Is the loop being unrolled?
>

Yes, in a suboptimal way. https://pastebin.com/nkXvzMiW

We decided to stick with hand unrolling based on the test results instead of depending on the compiler's mercy to do it for us [2].
If you think it can be improved, then please submit a patch.

[2]https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88760

>>     On IA with a Broadwell(Xeon E5-2690) and i40e the performance
>remain same w.r.t Rx/Tx since the
>>     hotspot is in the Tx path of the driver which limits the per core
>capability.
>>     But the performance difference in number of cycles per node can
>be seen below:
>>

[1]

>> 	Hand unrolling:
>> +-------------------------------+---------------+---------------+---------------+-
>--------------+---------------+-----------+
>> |Node                           |calls          |objs           |realloc_count  |objs/call
>|objs/sec(10E6) |cycles/call|
>> +-------------------------------+---------------+---------------+---------------+-
>--------------+---------------+-----------+
>> |ip4_lookup                     |7765918        |248509344      |1              |32.000
>|27.725408      |779.0000   |
>> |ip4_rewrite                    |7765925        |248509568      |1              |32.000
>|27.725408      |425.0000   |
>> |ethdev_tx-1                    |7765927        |204056223      |1              |26.000
>|22.762720      |597.0000   |
>> |pkt_drop                       |1389170        |44453409       |1              |32.000
>|4.962688       |298.0000   |
>> |ethdev_rx-0-0                  |63604111       |248509792      |2              |32.000
>|27.725408      |982.0000   |
>> +-------------------------------+---------------+---------------+---------------+-
>--------------+---------------+-----------+
>>
>> 	W/o unrolling:
>>
>> +-------------------------------+---------------+---------------+---------------+-
>--------------+---------------+-----------+
>> |Node                           |calls          |objs           |realloc_count  |objs/call
>|objs/sec(10E6) |cycles/call|
>> +-------------------------------+---------------+---------------+---------------+-
>--------------+---------------+-----------+
>> |ip4_lookup                     |18864640       |603668448      |1              |32.000
>|26.051328      |828.0000   |
>> |ip4_rewrite                    |18864646       |603668640      |1              |32.000
>|26.051328      |534.0000   |
>> |ethdev_tx-1                    |18864648       |527874175      |1              |27.000
>|22.780256      |633.0000   |
>> |pkt_drop                       |2368580        |75794529       |1              |32.000
>|3.271072       |286.0000   |
>> |ethdev_rx-0-0                  |282058226      |603668864      |2              |32.000
>|26.051328      |994.0000   |
>> +-------------------------------+---------------+---------------+---------------+-
>--------------+---------------+-----------+
>>
>> Considering the above findings we would like to continue unrolling
>the loops by hand.
>>
>> Regards,
>> Pavan.
>>
>>> -----Original Message-----
>>> From: Ray Kinsella <mdr@ashroe.eu>
>>> Sent: Friday, March 20, 2020 2:44 PM
>>> To: Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>;
>Jerin
>>> Jacob Kollanukkaran <jerinj@marvell.com>; Nithin Kumar
>Dabilpuram
>>> <ndabilpuram@marvell.com>
>>> Cc: dev@dpdk.org; thomas@monjalon.net;
>>> david.marchand@redhat.com; mattias.ronnblom@ericsson.com;
>Kiran
>>> Kumar Kokkilagadda <kirankumark@marvell.com>
>>> Subject: Re: [EXT] Re: [dpdk-dev] [PATCH v1 20/26] node: ipv4
>lookup
>>> for x86
>>>
>>>
>>>
>>> On 19/03/2020 16:13, Pavan Nikhilesh Bhagavatula wrote:
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: Ray Kinsella <mdr@ashroe.eu>
>>>>> Sent: Thursday, March 19, 2020 9:21 PM
>>>>> To: Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>;
>>> Jerin
>>>>> Jacob Kollanukkaran <jerinj@marvell.com>; Nithin Kumar
>>> Dabilpuram
>>>>> <ndabilpuram@marvell.com>
>>>>> Cc: dev@dpdk.org; thomas@monjalon.net;
>>>>> david.marchand@redhat.com; mattias.ronnblom@ericsson.com;
>>> Kiran
>>>>> Kumar Kokkilagadda <kirankumark@marvell.com>
>>>>> Subject: Re: [EXT] Re: [dpdk-dev] [PATCH v1 20/26] node: ipv4
>>> lookup
>>>>> for x86
>>>>>
>>>>>
>>>>>
>>>>> On 19/03/2020 14:22, Pavan Nikhilesh Bhagavatula wrote:
>>>>>>> On 18/03/2020 21:35, jerinj@marvell.com wrote:
>>>>>>>> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>>>>>>>>
>>>>>>>> Add IPv4 lookup process function for ip4_lookup
>>>>>>>> rte_node. This node performs LPM lookup using x86_64
>>>>>>>> vector supported RTE_LPM API on every packet received
>>>>>>>> and forwards it to a next node that is identified by
>>>>>>>> lookup result.
>>>>>>>>
>>>>>>>> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
>>>>>>>> Signed-off-by: Nithin Dabilpuram
><ndabilpuram@marvell.com>
>>>>>>>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
>>>>>>>> ---
>>>>>>>>  lib/librte_node/ip4_lookup.c | 245
>>>>>>> +++++++++++++++++++++++++++++++++++
>>>>>>>>  1 file changed, 245 insertions(+)
>>>>>>>>
>>>>>>>> diff --git a/lib/librte_node/ip4_lookup.c
>>>>>>> b/lib/librte_node/ip4_lookup.c
>>>>>>>> index d7fcd1158..c003e9c91 100644
>>>>>>>> --- a/lib/librte_node/ip4_lookup.c
>>>>>>>> +++ b/lib/librte_node/ip4_lookup.c
>>>>>>>> @@ -264,6 +264,251 @@ ip4_lookup_node_process(struct
>>>>> rte_graph
>>>>>>> *graph, struct rte_node *node,
>>>>>>>>  	return nb_objs;
>>>>>>>>  }
>>>>>>>>
>>>>>>>> +#elif defined(RTE_ARCH_X86)
>>>>>>>> +
>>>>>>>> +/* X86 SSE */
>>>>>>>> +static uint16_t
>>>>>>>> +ip4_lookup_node_process(struct rte_graph *graph, struct
>>>>> rte_node
>>>>>>> *node,
>>>>>>>> +			void **objs, uint16_t nb_objs)
>>>>>>>> +{
>>>>>>>> +	struct rte_mbuf *mbuf0, *mbuf1, *mbuf2, *mbuf3,
>>> **pkts;
>>>>>>>> +	rte_edge_t next0, next1, next2, next3, next_index;
>>>>>>>> +	struct rte_ipv4_hdr *ipv4_hdr;
>>>>>>>> +	struct rte_ether_hdr *eth_hdr;
>>>>>>>> +	uint32_t ip0, ip1, ip2, ip3;
>>>>>>>> +	void **to_next, **from;
>>>>>>>> +	uint16_t last_spec = 0;
>>>>>>>> +	uint16_t n_left_from;
>>>>>>>> +	struct rte_lpm *lpm;
>>>>>>>> +	uint16_t held = 0;
>>>>>>>> +	uint32_t drop_nh;
>>>>>>>> +	rte_xmm_t dst;
>>>>>>>> +	__m128i dip; /* SSE register */
>>>>>>>> +	int rc, i;
>>>>>>>> +
>>>>>>>> +	/* Speculative next */
>>>>>>>> +	next_index =
>>> RTE_NODE_IP4_LOOKUP_NEXT_REWRITE;
>>>>>>>> +	/* Drop node */
>>>>>>>> +	drop_nh =
>>>>>>> ((uint32_t)RTE_NODE_IP4_LOOKUP_NEXT_PKT_DROP) << 16;
>>>>>>>> +
>>>>>>>> +	/* Get socket specific LPM from ctx */
>>>>>>>> +	lpm = *((struct rte_lpm **)node->ctx);
>>>>>>>> +
>>>>>>>> +	pkts = (struct rte_mbuf **)objs;
>>>>>>>> +	from = objs;
>>>>>>>> +	n_left_from = nb_objs;
>>>>>>>
>>>>>>> I doubt this initial prefetch of the first 4 packets has any
>benefit.
>>>>>>
>>>>>> Ack will remove in v2 for x86.
>>>>>>
>>>>>>>
>>>>>>>> +	if (n_left_from >= 4) {
>>>>>>>> +		for (i = 0; i < 4; i++) {
>>>>>>>> +
>>> 	rte_prefetch0(rte_pktmbuf_mtod(pkts[i],
>>>>>>>> +						       struct
>>> rte_ether_hdr
>>>>>>> *) +
>>>>>>>> +				      1);
>>>>>>>> +		}
>>>>>>>> +	}
>>>>>>>> +
>>>>>>>> +	/* Get stream for the speculated next node */
>>>>>>>> +	to_next = rte_node_next_stream_get(graph, node,
>>>>>>> next_index, nb_objs);
>>>>>>>
>>>>>>> Suggest you don't reuse the hand-unrolling optimization from
>>> FD.io
>>>>>>> VPP.
>>>>>>> I have never found any performance benefit from them, and
>they
>>>>>>> make the code unnecessarily verbose.
>>>>>>>
>>>>>>
>>>>>> How would be take the benefit of rte_lpm_lookupx4 without
>>>>> unrolling the loop?.
>>>>>> Also, in future if we are using rte_rib and fib with a CPU
>supporting
>>>>> wider SIMD we might
>>>>>> need to unroll them further (AVX256 AND 512 currently
>>>>> rte_lpm_lookup uses only 128bit
>>>>>> since it is only uses SSE extension).
>>>>>
>>>>> Let the compiler do it for you, but using a constant vector length.
>>>>> for (int i=0; i < 4; ++i) { ... }
>>>>>
>>>>
>>>> Ok, I think I misunderstood the previous comment.
>>>> It was only for the prefetches in the loop right?
>>>
>>>
>>> no, it was for all the needless repetition.
>>> hand-unrolling loops serve no purpose but to add verbosity.
>>>
>>>>
>>>>>>
>>>>>>>
>>>>>>>> +	while (n_left_from >= 4) {
>>>>>>>> +		/* Prefetch next-next mbufs */
>>>>>>>> +		if (likely(n_left_from >= 11)) {
>>>>>>>> +			rte_prefetch0(pkts[8]);
>>>>>>>> +			rte_prefetch0(pkts[9]);
>>>>>>>> +			rte_prefetch0(pkts[10]);
>>>>>>>> +			rte_prefetch0(pkts[11]);
>>>>>>>> +		}
>>>>>>>> +
>>>>>>>> +		/* Prefetch next mbuf data */
>>>>>>>> +		if (likely(n_left_from >= 7)) {
>>>>>>>> +
>>> 	rte_prefetch0(rte_pktmbuf_mtod(pkts[4],
>>>>>>>> +						       struct
>>> rte_ether_hdr
>>>>>>> *) +
>>>>>>>> +				      1);
>>>>>>>> +
>>> 	rte_prefetch0(rte_pktmbuf_mtod(pkts[5],
>>>>>>>> +						       struct
>>> rte_ether_hdr
>>>>>>> *) +
>>>>>>>> +				      1);
>>>>>>>> +
>>> 	rte_prefetch0(rte_pktmbuf_mtod(pkts[6],
>>>>>>>> +						       struct
>>> rte_ether_hdr
>>>>>>> *) +
>>>>>>>> +				      1);
>>>>>>>> +
>>> 	rte_prefetch0(rte_pktmbuf_mtod(pkts[7],
>>>>>>>> +						       struct
>>> rte_ether_hdr
>>>>>>> *) +
>>>>>>>> +				      1);
>>>>>>>> +		}
>>>>>>>> +
>>>>>>>> +		mbuf0 = pkts[0];
>>>>>>>> +		mbuf1 = pkts[1];
>>>>>>>> +		mbuf2 = pkts[2];
>>>>>>>> +		mbuf3 = pkts[3];
>>>>>>>> +
>>>>>>>> +		pkts += 4;
>>>>>>>> +		n_left_from -= 4;
>>>>>>>> +
>>>>>>>> +		/* Extract DIP of mbuf0 */
>>>>>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf0, struct
>>>>>>> rte_ether_hdr *);
>>>>>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>>>>>> +		ip0 = ipv4_hdr->dst_addr;
>>>>>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>>>>>> +		rte_node_mbuf_priv1(mbuf0)->cksum =
>>> ipv4_hdr-
>>>>>>>> hdr_checksum;
>>>>>>>> +		rte_node_mbuf_priv1(mbuf0)->ttl = ipv4_hdr-
>>>>>>>> time_to_live;
>>>>>>>> +
>>>>>>>> +		/* Extract DIP of mbuf1 */
>>>>>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf1, struct
>>>>>>> rte_ether_hdr *);
>>>>>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>>>>>> +		ip1 = ipv4_hdr->dst_addr;
>>>>>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>>>>>> +		rte_node_mbuf_priv1(mbuf1)->cksum =
>>> ipv4_hdr-
>>>>>>>> hdr_checksum;
>>>>>>>> +		rte_node_mbuf_priv1(mbuf1)->ttl = ipv4_hdr-
>>>>>>>> time_to_live;
>>>>>>>> +
>>>>>>>> +		/* Extract DIP of mbuf2 */
>>>>>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf2, struct
>>>>>>> rte_ether_hdr *);
>>>>>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>>>>>> +		ip2 = ipv4_hdr->dst_addr;
>>>>>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>>>>>> +		rte_node_mbuf_priv1(mbuf2)->cksum =
>>> ipv4_hdr-
>>>>>>>> hdr_checksum;
>>>>>>>> +		rte_node_mbuf_priv1(mbuf2)->ttl = ipv4_hdr-
>>>>>>>> time_to_live;
>>>>>>>> +
>>>>>>>> +		/* Extract DIP of mbuf3 */
>>>>>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf3, struct
>>>>>>> rte_ether_hdr *);
>>>>>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>>>>>> +		ip3 = ipv4_hdr->dst_addr;
>>>>>>>> +
>>>>>>>> +		/* Prepare for lookup x4 */
>>>>>>>> +		dip = _mm_set_epi32(ip3, ip2, ip1, ip0);
>>>>>>>> +
>>>>>>>> +		/* Byte swap 4 IPV4 addresses. */
>>>>>>>> +		const __m128i bswap_mask = _mm_set_epi8(
>>>>>>>> +			12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1,
>>> 2, 3);
>>>>>>>> +		dip = _mm_shuffle_epi8(dip, bswap_mask);
>>>>>>>> +
>>>>>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>>>>>> +		rte_node_mbuf_priv1(mbuf3)->cksum =
>>> ipv4_hdr-
>>>>>>>> hdr_checksum;
>>>>>>>> +		rte_node_mbuf_priv1(mbuf3)->ttl = ipv4_hdr-
>>>>>>>> time_to_live;
>>>>>>>> +
>>>>>>>> +		/* Perform LPM lookup to get NH and next
>>> node */
>>>>>>>> +		rte_lpm_lookupx4(lpm, dip, dst.u32, drop_nh);
>>>>>>>> +
>>>>>>>> +		/* Extract next node id and NH */
>>>>>>>> +		rte_node_mbuf_priv1(mbuf0)->nh = dst.u32[0]
>>> &
>>>>>>> 0xFFFF;
>>>>>>>> +		next0 = (dst.u32[0] >> 16);
>>>>>>>> +
>>>>>>>> +		rte_node_mbuf_priv1(mbuf1)->nh = dst.u32[1]
>>> &
>>>>>>> 0xFFFF;
>>>>>>>> +		next1 = (dst.u32[1] >> 16);
>>>>>>>> +
>>>>>>>> +		rte_node_mbuf_priv1(mbuf2)->nh = dst.u32[2]
>>> &
>>>>>>> 0xFFFF;
>>>>>>>> +		next2 = (dst.u32[2] >> 16);
>>>>>>>> +
>>>>>>>> +		rte_node_mbuf_priv1(mbuf3)->nh = dst.u32[3]
>>> &
>>>>>>> 0xFFFF;
>>>>>>>> +		next3 = (dst.u32[3] >> 16);
>>>>>>>> +
>>>>>>>> +		/* Enqueue four to next node */
>>>>>>>> +		rte_edge_t fix_spec =
>>>>>>>> +			(next_index ^ next0) | (next_index ^
>>> next1) |
>>>>>>>> +			(next_index ^ next2) | (next_index ^
>>> next3);
>>>>>>>> +
>>>>>>>> +		if (unlikely(fix_spec)) {
>>>>>>>> +			/* Copy things successfully speculated
>>> till now
>>>>>>> */
>>>>>>>> +			rte_memcpy(to_next, from, last_spec *
>>>>>>> sizeof(from[0]));
>>>>>>>> +			from += last_spec;
>>>>>>>> +			to_next += last_spec;
>>>>>>>> +			held += last_spec;
>>>>>>>> +			last_spec = 0;
>>>>>>>> +
>>>>>>>> +			/* Next0 */
>>>>>>>> +			if (next_index == next0) {
>>>>>>>> +				to_next[0] = from[0];
>>>>>>>> +				to_next++;
>>>>>>>> +				held++;
>>>>>>>> +			} else {
>>>>>>>> +				rte_node_enqueue_x1(graph,
>>> node,
>>>>>>> next0,
>>>>>>>> +						    from[0]);
>>>>>>>> +			}
>>>>>>>> +
>>>>>>>> +			/* Next1 */
>>>>>>>> +			if (next_index == next1) {
>>>>>>>> +				to_next[0] = from[1];
>>>>>>>> +				to_next++;
>>>>>>>> +				held++;
>>>>>>>> +			} else {
>>>>>>>> +				rte_node_enqueue_x1(graph,
>>> node,
>>>>>>> next1,
>>>>>>>> +						    from[1]);
>>>>>>>> +			}
>>>>>>>> +
>>>>>>>> +			/* Next2 */
>>>>>>>> +			if (next_index == next2) {
>>>>>>>> +				to_next[0] = from[2];
>>>>>>>> +				to_next++;
>>>>>>>> +				held++;
>>>>>>>> +			} else {
>>>>>>>> +				rte_node_enqueue_x1(graph,
>>> node,
>>>>>>> next2,
>>>>>>>> +						    from[2]);
>>>>>>>> +			}
>>>>>>>> +
>>>>>>>> +			/* Next3 */
>>>>>>>> +			if (next_index == next3) {
>>>>>>>> +				to_next[0] = from[3];
>>>>>>>> +				to_next++;
>>>>>>>> +				held++;
>>>>>>>> +			} else {
>>>>>>>> +				rte_node_enqueue_x1(graph,
>>> node,
>>>>>>> next3,
>>>>>>>> +						    from[3]);
>>>>>>>> +			}
>>>>>>>> +
>>>>>>>> +			from += 4;
>>>>>>>> +
>>>>>>>> +		} else {
>>>>>>>> +			last_spec += 4;
>>>>>>>> +		}
>>>>>>>> +	}
>>>>>>>> +
>>>>>>>> +	while (n_left_from > 0) {
>>>>>>>> +		uint32_t next_hop;
>>>>>>>> +
>>>>>>>> +		mbuf0 = pkts[0];
>>>>>>>> +
>>>>>>>> +		pkts += 1;
>>>>>>>> +		n_left_from -= 1;
>>>>>>>> +
>>>>>>>> +		/* Extract DIP of mbuf0 */
>>>>>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf0, struct
>>>>>>> rte_ether_hdr *);
>>>>>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>>>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>>>>>> +		rte_node_mbuf_priv1(mbuf0)->cksum =
>>> ipv4_hdr-
>>>>>>>> hdr_checksum;
>>>>>>>> +		rte_node_mbuf_priv1(mbuf0)->ttl = ipv4_hdr-
>>>>>>>> time_to_live;
>>>>>>>> +
>>>>>>>> +		rc = rte_lpm_lookup(lpm,
>>> rte_be_to_cpu_32(ipv4_hdr-
>>>>>>>> dst_addr),
>>>>>>>> +				    &next_hop);
>>>>>>>> +		next_hop = (rc == 0) ? next_hop : drop_nh;
>>>>>>>> +
>>>>>>>> +		rte_node_mbuf_priv1(mbuf0)->nh = next_hop
>>> &
>>>>>>> 0xFFFF;
>>>>>>>> +		next0 = (next_hop >> 16);
>>>>>>>> +
>>>>>>>> +		if (unlikely(next_index ^ next0)) {
>>>>>>>> +			/* Copy things successfully speculated
>>> till now
>>>>>>> */
>>>>>>>> +			rte_memcpy(to_next, from, last_spec *
>>>>>>> sizeof(from[0]));
>>>>>>>> +			from += last_spec;
>>>>>>>> +			to_next += last_spec;
>>>>>>>> +			held += last_spec;
>>>>>>>> +			last_spec = 0;
>>>>>>>> +
>>>>>>>> +			rte_node_enqueue_x1(graph, node,
>>> next0,
>>>>>>> from[0]);
>>>>>>>> +			from += 1;
>>>>>>>> +		} else {
>>>>>>>> +			last_spec += 1;
>>>>>>>> +		}
>>>>>>>> +	}
>>>>>>>> +
>>>>>>>> +	/* !!! Home run !!! */
>>>>>>>> +	if (likely(last_spec == nb_objs)) {
>>>>>>>> +		rte_node_next_stream_move(graph, node,
>>>>>>> next_index);
>>>>>>>> +		return nb_objs;
>>>>>>>> +	}
>>>>>>>> +
>>>>>>>> +	held += last_spec;
>>>>>>>> +	/* Copy things successfully speculated till now */
>>>>>>>> +	rte_memcpy(to_next, from, last_spec *
>>> sizeof(from[0]));
>>>>>>>> +	rte_node_next_stream_put(graph, node, next_index,
>>> held);
>>>>>>>> +
>>>>>>>> +	return nb_objs;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>>  #else
>>>>>>>>
>>>>>>>>  static uint16_t
>>>>>>>>
  
Ray Kinsella March 26, 2020, 4:54 p.m. UTC | #9
On 26/03/2020 09:56, Pavan Nikhilesh Bhagavatula wrote:
> 
>> -----Original Message-----
>> From: Ray Kinsella <mdr@ashroe.eu>
>> Sent: Tuesday, March 24, 2020 8:08 PM
>> To: Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>; Jerin
>> Jacob Kollanukkaran <jerinj@marvell.com>; Nithin Kumar Dabilpuram
>> <ndabilpuram@marvell.com>
>> Cc: dev@dpdk.org; thomas@monjalon.net;
>> david.marchand@redhat.com; mattias.ronnblom@ericsson.com; Kiran
>> Kumar Kokkilagadda <kirankumark@marvell.com>
>> Subject: Re: [EXT] Re: [dpdk-dev] [PATCH v1 20/26] node: ipv4 lookup
>> for x86
>>
>>
>>
>> On 24/03/2020 09:40, Pavan Nikhilesh Bhagavatula wrote:
>>> Hi Ray,
>>>
>>> I have tried to avoid hand unrolling loops and found the following
>> observations.
>>>
>>> 1. Although it decreases LOC it also takes away readability too.
>>> 	Example:
>>> 	Avoiding unrolled code below
>> [SNIP]
>>> 	Which is kind of unreadable.
>>
>> I am confused - isn't it exactly the same code?
>> You still haven't completely unrolled the loop either?
>>
>> I don't know how one is readable and the other is not.
> 
> I guess it’s a matter of personal preference.

You have a rare preference for verbosity, sir.

> 
>>
>>>
>>> 2. Not all compilers are made equal. I found that most of the
>> compilers don’t
>>>      Unroll the loop above even when compiled with `-funroll-all-loops`.
>>>      I have checked with following compilers:
>>> 	GCC 9.2.0
>>> 	Clang 9.0.1
>>> 	Aarch64 GCC 7.3.0
>>> 	Aarch64 GCC 9.2.0
>>
>> Compilers have been unrolling fixed length loops for as long time - this
>> isn't new technology.
>>
> 
> In theory, I agree with your view, but even the latest compiler is not doing a decent 
> job on unrolling the loop. 
> We can revisit this scheme, if and when compiler smart enough to do this as just unrolling 
> the loops is not good enough. It has to do it better than hand unrolling.
> For example on arm64 GCC doesn’t merge load/stores to load/store pairs when unrolling.
> (both ldr/str and ldp/stp latency is 3cyc and effectively ldp cost is halved).
> 
> Even on x86 we see extra ~100 cycles as mentioned in [1].
> 
>> If the compiler isn't unrolling you are doing something that makes it
>> think it is a bad idea.
>> Hand unrolling the loop isn't the solution, understanding what the
>> compiler is doing is a better idea.
>>
>> In front of your for loop insert, to indicate to the compiler what you
>> want to do.
>> #pragma unroll BUF_PER_LOOP
> 
> Can you check which versions of compiler which this pragma works on?

https://gcc.gnu.org/onlinedocs/gcc/Loop-Specific-Pragmas.html
https://clang.llvm.org/docs/AttributeReference.htm

> 
> Most of the gcc versions that I have tried just spit out the following warning.
> ../lib/librte_node/ip4_rewrite.c:59: warning: ignoring #pragma unroll BUF_PER_LOOP [-Wunknown-pragmas]
>    59 |  #pragma unroll BUF_PER_LOOP

Not sure on this one ... 

>>
>> With clang you can ask it why it is not unrolling the loop with the
>> following switches.
>> (output is verbose, but the reason is in there).
>>
>> -Rpass=loop-unroll -Rpass-missed=loop-unroll
>>
> 
> ../lib/librte_node/ip4_rewrite.c:57:2: remark: unrolled loop by a factor of 4 with run-time trip count [-Rpass=loop-unroll]
>         for (i = 0; i < nb_objs; i++) {
>         ^

That is good ... 

>>>
>>> 3. Performance wise I see a lot of degradation on our platform at least
>> 13%.
>>
>> Is the loop being unrolled?
>>
> 
> Yes, in a suboptimal way. https://pastebin.com/nkXvzMiW
> 
> we decide to stick with hand unrolling based on the test result instead of depending on compilers mercy to do it for us[2].

Why not code in assembler then, taking it to its logical conclusion.

> if you think, it can be improved then please submit a patch.

I don't have an ARM platform to test on.

From my own work in FD.io VPP, which you are clearly borrowing heavily from (to put it kindly),
I have never seen an advantage from hand-unrolling compared to letting the compiler do it.

Perhaps this case is the exception.
I would be suspicious though that the compiler is making bad decisions for some reason.

In any case, if you (and presumably the maintainer) are happy with verbose code like this making its way into the codebase,
I won't argue with it.

> [2]https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88760
> 
>>>     On IA with a Broadwell(Xeon E5-2690) and i40e the performance
>> remain same w.r.t Rx/Tx since the
>>>     hotspot is in the Tx path of the driver which limits the per core
>> capability.
>>>     But the performance difference in number of cycles per node can
>> be seen below:
>>>
> 
> [1]
> 
>>> 	Hand unrolling:
>>> +-------------------------------+---------------+---------------+---------------+-
>> --------------+---------------+-----------+
>>> |Node                           |calls          |objs           |realloc_count  |objs/call
>> |objs/sec(10E6) |cycles/call|
>>> +-------------------------------+---------------+---------------+---------------+-
>> --------------+---------------+-----------+
>>> |ip4_lookup                     |7765918        |248509344      |1              |32.000
>> |27.725408      |779.0000   |
>>> |ip4_rewrite                    |7765925        |248509568      |1              |32.000
>> |27.725408      |425.0000   |
>>> |ethdev_tx-1                    |7765927        |204056223      |1              |26.000
>> |22.762720      |597.0000   |
>>> |pkt_drop                       |1389170        |44453409       |1              |32.000
>> |4.962688       |298.0000   |
>>> |ethdev_rx-0-0                  |63604111       |248509792      |2              |32.000
>> |27.725408      |982.0000   |
>>> +-------------------------------+---------------+---------------+---------------+-
>> --------------+---------------+-----------+
>>>
>>> 	W/o unrolling:
>>>
>>> +-------------------------------+---------------+---------------+---------------+-
>> --------------+---------------+-----------+
>>> |Node                           |calls          |objs           |realloc_count  |objs/call
>> |objs/sec(10E6) |cycles/call|
>>> +-------------------------------+---------------+---------------+---------------+-
>> --------------+---------------+-----------+
>>> |ip4_lookup                     |18864640       |603668448      |1              |32.000
>> |26.051328      |828.0000   |
>>> |ip4_rewrite                    |18864646       |603668640      |1              |32.000
>> |26.051328      |534.0000   |
>>> |ethdev_tx-1                    |18864648       |527874175      |1              |27.000
>> |22.780256      |633.0000   |
>>> |pkt_drop                       |2368580        |75794529       |1              |32.000
>> |3.271072       |286.0000   |
>>> |ethdev_rx-0-0                  |282058226      |603668864      |2              |32.000
>> |26.051328      |994.0000   |
>>> +-------------------------------+---------------+---------------+---------------+-
>> --------------+---------------+-----------+
>>>
>>> Considering the above findings we would like to continue unrolling
>> the loops by hand.
>>>
>>> Regards,
>>> Pavan.
>>>
>>>> -----Original Message-----
>>>> From: Ray Kinsella <mdr@ashroe.eu>
>>>> Sent: Friday, March 20, 2020 2:44 PM
>>>> To: Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>;
>> Jerin
>>>> Jacob Kollanukkaran <jerinj@marvell.com>; Nithin Kumar
>> Dabilpuram
>>>> <ndabilpuram@marvell.com>
>>>> Cc: dev@dpdk.org; thomas@monjalon.net;
>>>> david.marchand@redhat.com; mattias.ronnblom@ericsson.com;
>> Kiran
>>>> Kumar Kokkilagadda <kirankumark@marvell.com>
>>>> Subject: Re: [EXT] Re: [dpdk-dev] [PATCH v1 20/26] node: ipv4
>> lookup
>>>> for x86
>>>>
>>>>
>>>>
>>>> On 19/03/2020 16:13, Pavan Nikhilesh Bhagavatula wrote:
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Ray Kinsella <mdr@ashroe.eu>
>>>>>> Sent: Thursday, March 19, 2020 9:21 PM
>>>>>> To: Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>; Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Nithin Kumar Dabilpuram <ndabilpuram@marvell.com>
>>>>>> Cc: dev@dpdk.org; thomas@monjalon.net; david.marchand@redhat.com; mattias.ronnblom@ericsson.com; Kiran Kumar Kokkilagadda <kirankumark@marvell.com>
>>>>>> Subject: Re: [EXT] Re: [dpdk-dev] [PATCH v1 20/26] node: ipv4 lookup for x86
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 19/03/2020 14:22, Pavan Nikhilesh Bhagavatula wrote:
>>>>>>>> On 18/03/2020 21:35, jerinj@marvell.com wrote:
>>>>>>>>> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>>>>>>>>>
>>>>>>>>> Add IPv4 lookup process function for ip4_lookup
>>>>>>>>> rte_node. This node performs LPM lookup using x86_64
>>>>>>>>> vector supported RTE_LPM API on every packet received
>>>>>>>>> and forwards it to a next node that is identified by
>>>>>>>>> lookup result.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
>>>>>>>>> Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>
>>>>>>>>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
>>>>>>>>> ---
>>>>>>>>>  lib/librte_node/ip4_lookup.c | 245 +++++++++++++++++++++++++++++++++++
>>>>>>>>>  1 file changed, 245 insertions(+)
>>>>>>>>>
>>>>>>>>> diff --git a/lib/librte_node/ip4_lookup.c b/lib/librte_node/ip4_lookup.c
>>>>>>>>> index d7fcd1158..c003e9c91 100644
>>>>>>>>> --- a/lib/librte_node/ip4_lookup.c
>>>>>>>>> +++ b/lib/librte_node/ip4_lookup.c
>>>>>>>>> @@ -264,6 +264,251 @@ ip4_lookup_node_process(struct rte_graph *graph, struct rte_node *node,
>>>>>>>>>  	return nb_objs;
>>>>>>>>>  }
>>>>>>>>>
>>>>>>>>> +#elif defined(RTE_ARCH_X86)
>>>>>>>>> +
>>>>>>>>> +/* X86 SSE */
>>>>>>>>> +static uint16_t
>>>>>>>>> +ip4_lookup_node_process(struct rte_graph *graph, struct rte_node *node,
>>>>>>>>> +			void **objs, uint16_t nb_objs)
>>>>>>>>> +{
>>>>>>>>> +	struct rte_mbuf *mbuf0, *mbuf1, *mbuf2, *mbuf3, **pkts;
>>>>>>>>> +	rte_edge_t next0, next1, next2, next3, next_index;
>>>>>>>>> +	struct rte_ipv4_hdr *ipv4_hdr;
>>>>>>>>> +	struct rte_ether_hdr *eth_hdr;
>>>>>>>>> +	uint32_t ip0, ip1, ip2, ip3;
>>>>>>>>> +	void **to_next, **from;
>>>>>>>>> +	uint16_t last_spec = 0;
>>>>>>>>> +	uint16_t n_left_from;
>>>>>>>>> +	struct rte_lpm *lpm;
>>>>>>>>> +	uint16_t held = 0;
>>>>>>>>> +	uint32_t drop_nh;
>>>>>>>>> +	rte_xmm_t dst;
>>>>>>>>> +	__m128i dip; /* SSE register */
>>>>>>>>> +	int rc, i;
>>>>>>>>> +
>>>>>>>>> +	/* Speculative next */
>>>>>>>>> +	next_index = RTE_NODE_IP4_LOOKUP_NEXT_REWRITE;
>>>>>>>>> +	/* Drop node */
>>>>>>>>> +	drop_nh = ((uint32_t)RTE_NODE_IP4_LOOKUP_NEXT_PKT_DROP) << 16;
>>>>>>>>> +
>>>>>>>>> +	/* Get socket specific LPM from ctx */
>>>>>>>>> +	lpm = *((struct rte_lpm **)node->ctx);
>>>>>>>>> +
>>>>>>>>> +	pkts = (struct rte_mbuf **)objs;
>>>>>>>>> +	from = objs;
>>>>>>>>> +	n_left_from = nb_objs;
>>>>>>>>
>>>>>>>> I doubt this initial prefetch of the first 4 packets has any benefit.
>>>>>>>
>>>>>>> Ack will remove in v2 for x86.
>>>>>>>
>>>>>>>>
>>>>>>>>> +	if (n_left_from >= 4) {
>>>>>>>>> +		for (i = 0; i < 4; i++) {
>>>>>>>>> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[i],
>>>>>>>>> +						       struct rte_ether_hdr *) +
>>>>>>>>> +				      1);
>>>>>>>>> +		}
>>>>>>>>> +	}
>>>>>>>>> +
>>>>>>>>> +	/* Get stream for the speculated next node */
>>>>>>>>> +	to_next = rte_node_next_stream_get(graph, node, next_index, nb_objs);
>>>>>>>>
>>>>>>>> Suggest you don't reuse the hand-unrolling optimization from FD.io VPP.
>>>>>>>> I have never found any performance benefit from them, and they make the code unnecessarily verbose.
>>>>>>>>
>>>>>>>
>>>>>>> How would we take the benefit of rte_lpm_lookupx4 without unrolling the loop?
>>>>>>> Also, in future, if we are using rte_rib and fib with a CPU supporting wider SIMD we might
>>>>>>> need to unroll them further (AVX 256 and 512; currently rte_lpm_lookup uses only 128-bit
>>>>>>> since it only uses the SSE extension).
>>>>>>
>>>>>> Let the compiler do it for you, but using a constant vector length.
>>>>>> for (int i=0; i < 4; ++i) { ... }
>>>>>>
>>>>>
>>>>> Ok, I think I misunderstood the previous comment.
>>>>> It was only for the prefetches in the loop right?
>>>>
>>>>
>>>> no, it was for all the needless repetition.
>>>> hand-unrolling loops serve no purpose but to add verbosity.
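
Below is a rough sketch of the constant-trip-count style being suggested, assuming the same variables as in the patch (pkts, ipv4_hdr, dip, dst, lpm, drop_nh) and leaving the unrolling/vectorization of the fixed-length loops to the compiler:

	uint32_t dst_ip[4];
	int i;

	/* Gather the four destination addresses with a fixed-length loop */
	for (i = 0; i < 4; i++) {
		ipv4_hdr = rte_pktmbuf_mtod_offset(pkts[i], struct rte_ipv4_hdr *,
						   sizeof(struct rte_ether_hdr));
		dst_ip[i] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
		rte_node_mbuf_priv1(pkts[i])->cksum = ipv4_hdr->hdr_checksum;
		rte_node_mbuf_priv1(pkts[i])->ttl = ipv4_hdr->time_to_live;
	}

	/* One x4 LPM lookup, same as in the hand-unrolled version */
	dip = _mm_set_epi32(dst_ip[3], dst_ip[2], dst_ip[1], dst_ip[0]);
	rte_lpm_lookupx4(lpm, dip, dst.u32, drop_nh);

	/* Lower 16 bits: next hop, upper 16 bits: next node id */
	for (i = 0; i < 4; i++)
		rte_node_mbuf_priv1(pkts[i])->nh = dst.u32[i] & 0xFFFF;

Whether the compiler actually emits code comparable to the hand-unrolled version is what the benchmark tables at the top of this message were meant to settle.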
  

Patch

diff --git a/lib/librte_node/ip4_lookup.c b/lib/librte_node/ip4_lookup.c
index d7fcd1158..c003e9c91 100644
--- a/lib/librte_node/ip4_lookup.c
+++ b/lib/librte_node/ip4_lookup.c
@@ -264,6 +264,251 @@  ip4_lookup_node_process(struct rte_graph *graph, struct rte_node *node,
 	return nb_objs;
 }
 
+#elif defined(RTE_ARCH_X86)
+
+/* X86 SSE */
+static uint16_t
+ip4_lookup_node_process(struct rte_graph *graph, struct rte_node *node,
+			void **objs, uint16_t nb_objs)
+{
+	struct rte_mbuf *mbuf0, *mbuf1, *mbuf2, *mbuf3, **pkts;
+	rte_edge_t next0, next1, next2, next3, next_index;
+	struct rte_ipv4_hdr *ipv4_hdr;
+	struct rte_ether_hdr *eth_hdr;
+	uint32_t ip0, ip1, ip2, ip3;
+	void **to_next, **from;
+	uint16_t last_spec = 0;
+	uint16_t n_left_from;
+	struct rte_lpm *lpm;
+	uint16_t held = 0;
+	uint32_t drop_nh;
+	rte_xmm_t dst;
+	__m128i dip; /* SSE register */
+	int rc, i;
+
+	/* Speculative next */
+	next_index = RTE_NODE_IP4_LOOKUP_NEXT_REWRITE;
+	/* Drop node */
+	drop_nh = ((uint32_t)RTE_NODE_IP4_LOOKUP_NEXT_PKT_DROP) << 16;
+
+	/* Get socket specific LPM from ctx */
+	lpm = *((struct rte_lpm **)node->ctx);
+
+	pkts = (struct rte_mbuf **)objs;
+	from = objs;
+	n_left_from = nb_objs;
+
+	if (n_left_from >= 4) {
+		for (i = 0; i < 4; i++) {
+			rte_prefetch0(rte_pktmbuf_mtod(pkts[i],
+						       struct rte_ether_hdr *) +
+				      1);
+		}
+	}
+
+	/* Get stream for the speculated next node */
+	to_next = rte_node_next_stream_get(graph, node, next_index, nb_objs);
+	while (n_left_from >= 4) {
+		/* Prefetch next-next mbufs */
+		if (likely(n_left_from >= 11)) {
+			rte_prefetch0(pkts[8]);
+			rte_prefetch0(pkts[9]);
+			rte_prefetch0(pkts[10]);
+			rte_prefetch0(pkts[11]);
+		}
+
+		/* Prefetch next mbuf data */
+		if (likely(n_left_from >= 7)) {
+			rte_prefetch0(rte_pktmbuf_mtod(pkts[4],
+						       struct rte_ether_hdr *) +
+				      1);
+			rte_prefetch0(rte_pktmbuf_mtod(pkts[5],
+						       struct rte_ether_hdr *) +
+				      1);
+			rte_prefetch0(rte_pktmbuf_mtod(pkts[6],
+						       struct rte_ether_hdr *) +
+				      1);
+			rte_prefetch0(rte_pktmbuf_mtod(pkts[7],
+						       struct rte_ether_hdr *) +
+				      1);
+		}
+
+		mbuf0 = pkts[0];
+		mbuf1 = pkts[1];
+		mbuf2 = pkts[2];
+		mbuf3 = pkts[3];
+
+		pkts += 4;
+		n_left_from -= 4;
+
+		/* Extract DIP of mbuf0 */
+		eth_hdr = rte_pktmbuf_mtod(mbuf0, struct rte_ether_hdr *);
+		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+		ip0 = ipv4_hdr->dst_addr;
+		/* Extract cksum, ttl as ipv4 hdr is in cache */
+		rte_node_mbuf_priv1(mbuf0)->cksum = ipv4_hdr->hdr_checksum;
+		rte_node_mbuf_priv1(mbuf0)->ttl = ipv4_hdr->time_to_live;
+
+		/* Extract DIP of mbuf1 */
+		eth_hdr = rte_pktmbuf_mtod(mbuf1, struct rte_ether_hdr *);
+		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+		ip1 = ipv4_hdr->dst_addr;
+		/* Extract cksum, ttl as ipv4 hdr is in cache */
+		rte_node_mbuf_priv1(mbuf1)->cksum = ipv4_hdr->hdr_checksum;
+		rte_node_mbuf_priv1(mbuf1)->ttl = ipv4_hdr->time_to_live;
+
+		/* Extract DIP of mbuf2 */
+		eth_hdr = rte_pktmbuf_mtod(mbuf2, struct rte_ether_hdr *);
+		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+		ip2 = ipv4_hdr->dst_addr;
+		/* Extract cksum, ttl as ipv4 hdr is in cache */
+		rte_node_mbuf_priv1(mbuf2)->cksum = ipv4_hdr->hdr_checksum;
+		rte_node_mbuf_priv1(mbuf2)->ttl = ipv4_hdr->time_to_live;
+
+		/* Extract DIP of mbuf3 */
+		eth_hdr = rte_pktmbuf_mtod(mbuf3, struct rte_ether_hdr *);
+		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+		ip3 = ipv4_hdr->dst_addr;
+
+		/* Prepare for lookup x4 */
+		dip = _mm_set_epi32(ip3, ip2, ip1, ip0);
+
+		/* Byte swap 4 IPV4 addresses. */
+		const __m128i bswap_mask = _mm_set_epi8(
+			12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
+		dip = _mm_shuffle_epi8(dip, bswap_mask);
+
+		/* Extract cksum, ttl as ipv4 hdr is in cache */
+		rte_node_mbuf_priv1(mbuf3)->cksum = ipv4_hdr->hdr_checksum;
+		rte_node_mbuf_priv1(mbuf3)->ttl = ipv4_hdr->time_to_live;
+
+		/* Perform LPM lookup to get NH and next node */
+		rte_lpm_lookupx4(lpm, dip, dst.u32, drop_nh);
+
+		/* Extract next node id and NH */
+		rte_node_mbuf_priv1(mbuf0)->nh = dst.u32[0] & 0xFFFF;
+		next0 = (dst.u32[0] >> 16);
+
+		rte_node_mbuf_priv1(mbuf1)->nh = dst.u32[1] & 0xFFFF;
+		next1 = (dst.u32[1] >> 16);
+
+		rte_node_mbuf_priv1(mbuf2)->nh = dst.u32[2] & 0xFFFF;
+		next2 = (dst.u32[2] >> 16);
+
+		rte_node_mbuf_priv1(mbuf3)->nh = dst.u32[3] & 0xFFFF;
+		next3 = (dst.u32[3] >> 16);
+
+		/* Enqueue four to next node */
+		rte_edge_t fix_spec =
+			(next_index ^ next0) | (next_index ^ next1) |
+			(next_index ^ next2) | (next_index ^ next3);
+
+		if (unlikely(fix_spec)) {
+			/* Copy things successfully speculated till now */
+			rte_memcpy(to_next, from, last_spec * sizeof(from[0]));
+			from += last_spec;
+			to_next += last_spec;
+			held += last_spec;
+			last_spec = 0;
+
+			/* Next0 */
+			if (next_index == next0) {
+				to_next[0] = from[0];
+				to_next++;
+				held++;
+			} else {
+				rte_node_enqueue_x1(graph, node, next0,
+						    from[0]);
+			}
+
+			/* Next1 */
+			if (next_index == next1) {
+				to_next[0] = from[1];
+				to_next++;
+				held++;
+			} else {
+				rte_node_enqueue_x1(graph, node, next1,
+						    from[1]);
+			}
+
+			/* Next2 */
+			if (next_index == next2) {
+				to_next[0] = from[2];
+				to_next++;
+				held++;
+			} else {
+				rte_node_enqueue_x1(graph, node, next2,
+						    from[2]);
+			}
+
+			/* Next3 */
+			if (next_index == next3) {
+				to_next[0] = from[3];
+				to_next++;
+				held++;
+			} else {
+				rte_node_enqueue_x1(graph, node, next3,
+						    from[3]);
+			}
+
+			from += 4;
+
+		} else {
+			last_spec += 4;
+		}
+	}
+
+	while (n_left_from > 0) {
+		uint32_t next_hop;
+
+		mbuf0 = pkts[0];
+
+		pkts += 1;
+		n_left_from -= 1;
+
+		/* Extract DIP of mbuf0 */
+		eth_hdr = rte_pktmbuf_mtod(mbuf0, struct rte_ether_hdr *);
+		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+		/* Extract cksum, ttl as ipv4 hdr is in cache */
+		rte_node_mbuf_priv1(mbuf0)->cksum = ipv4_hdr->hdr_checksum;
+		rte_node_mbuf_priv1(mbuf0)->ttl = ipv4_hdr->time_to_live;
+
+		rc = rte_lpm_lookup(lpm, rte_be_to_cpu_32(ipv4_hdr->dst_addr),
+				    &next_hop);
+		next_hop = (rc == 0) ? next_hop : drop_nh;
+
+		rte_node_mbuf_priv1(mbuf0)->nh = next_hop & 0xFFFF;
+		next0 = (next_hop >> 16);
+
+		if (unlikely(next_index ^ next0)) {
+			/* Copy things successfully speculated till now */
+			rte_memcpy(to_next, from, last_spec * sizeof(from[0]));
+			from += last_spec;
+			to_next += last_spec;
+			held += last_spec;
+			last_spec = 0;
+
+			rte_node_enqueue_x1(graph, node, next0, from[0]);
+			from += 1;
+		} else {
+			last_spec += 1;
+		}
+	}
+
+	/* !!! Home run !!! */
+	if (likely(last_spec == nb_objs)) {
+		rte_node_next_stream_move(graph, node, next_index);
+		return nb_objs;
+	}
+
+	held += last_spec;
+	/* Copy things successfully speculated till now */
+	rte_memcpy(to_next, from, last_spec * sizeof(from[0]));
+	rte_node_next_stream_put(graph, node, next_index, held);
+
+	return nb_objs;
+}
+
 #else
 
 static uint16_t
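
One detail worth spelling out: the (dst.u32[i] >> 16) and (& 0xFFFF) split in the lookup code assumes that whoever programs the LPM packs the next node id into the upper bits of each route's next-hop value and the rewrite next-hop index into the low 16 bits (drop_nh above is built the same way). A hypothetical control-plane sketch, with made-up prefix and index values:

	uint32_t nh;
	int ret;

	/* Hypothetical example: route 10.0.2.0/24 to the rewrite node,
	 * rewrite entry 5; the lookup node recovers the two halves with
	 * (>> 16) and (& 0xFFFF).
	 */
	nh = ((uint32_t)RTE_NODE_IP4_LOOKUP_NEXT_REWRITE << 16) | 5;
	ret = rte_lpm_add(lpm, RTE_IPV4(10, 0, 2, 0), 24, nh);
	if (ret < 0)
		rte_panic("ip4 route add failed: %d\n", ret);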