[v1,04/12] node: add process callback for IP4 FIB

Message ID 20250415121052.1497155-5-adwivedi@marvell.com (mailing list archive)
State New
Delegated to: Thomas Monjalon
Series add lookup fib nodes in graph library

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Ankur Dwivedi April 15, 2025, 12:10 p.m. UTC
Adds the process callback function for ip4_lookup_fib node.

Signed-off-by: Ankur Dwivedi <adwivedi@marvell.com>
---
 lib/node/ip4_lookup_fib.c | 164 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 164 insertions(+)
  

Comments

Nitin Saxena April 16, 2025, 7:54 a.m. UTC | #1
Hi Ankur,

The same comments apply to the IPv6 nodes as well; see the ip4 lookup comments below.

Thanks,
Nitin

On Tue, Apr 15, 2025 at 6:20 PM Ankur Dwivedi <adwivedi@marvell.com> wrote:
>
> Adds the process callback function for ip4_lookup_fib node.
>
> Signed-off-by: Ankur Dwivedi <adwivedi@marvell.com>
> ---
>  lib/node/ip4_lookup_fib.c | 164 ++++++++++++++++++++++++++++++++++++++
>  1 file changed, 164 insertions(+)
>
> diff --git a/lib/node/ip4_lookup_fib.c b/lib/node/ip4_lookup_fib.c
> index e87864e672..c535b191f8 100644
> --- a/lib/node/ip4_lookup_fib.c
> +++ b/lib/node/ip4_lookup_fib.c
> @@ -40,6 +40,169 @@ static struct ip4_lookup_fib_node_main ip4_lookup_fib_nm;
>  #define IP4_LOOKUP_NODE_PRIV1_OFF(ctx) \
>         (((struct ip4_lookup_fib_node_ctx *)ctx)->mbuf_priv1_off)
>
> +static uint16_t
> +ip4_lookup_fib_node_process(struct rte_graph *graph, struct rte_node *node, void **objs,
> +                           uint16_t nb_objs)
> +{
> +       struct rte_mbuf *mbuf0, *mbuf1, *mbuf2, *mbuf3, **pkts;
> +       struct rte_fib *fib = IP4_LOOKUP_NODE_FIB(node->ctx);
> +       const int dyn = IP4_LOOKUP_NODE_PRIV1_OFF(node->ctx);
> +       struct rte_ipv4_hdr *ipv4_hdr;
> +       uint64_t next_hop[nb_objs];
> +       uint16_t lookup_err = 0;
> +       void **to_next, **from;
> +       uint16_t last_spec = 0;
> +       rte_edge_t next_index;
> +       uint16_t n_left_from;
> +       uint32_t ip[nb_objs];
> +       uint16_t held = 0;
> +       uint32_t drop_nh;
> +       uint16_t next;
> +       int i, rc;
> +
> +       /* Speculative next */
> +       next_index = RTE_NODE_IP4_LOOKUP_NEXT_REWRITE;

Is it possible to add next_edge to node->ctx? Save next_index in the
node->init() function and again at the end of the process() function for
better speculative performance.
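
Something like the following minimal sketch is what I mean (the extra
next_index ctx field here is an assumption for illustration, not existing
code):

struct ip4_lookup_fib_node_ctx {
	/* existing fields (FIB pointer, mbuf_priv1_off, ...) */
	uint16_t next_index; /* assumed new field: last speculated next edge */
};

/* ip4_lookup_fib_node_init(): seed the speculation */
((struct ip4_lookup_fib_node_ctx *)node->ctx)->next_index =
		RTE_NODE_IP4_LOOKUP_NEXT_REWRITE;

/* process(): start from the edge remembered from the previous burst */
next_index = ((struct ip4_lookup_fib_node_ctx *)node->ctx)->next_index;

/* ... lookup and enqueue as in this patch ... */

/* before returning, store the edge that won the speculation */
((struct ip4_lookup_fib_node_ctx *)node->ctx)->next_index = next_index;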

Also, in general this function assumes packets are being forwarded
to the rewrite node, but there can be other paths as well, like
- LOCAL
- PUNT etc.

So let the next_hop returned from rte_fib_lookup_bulk() determine the
next_edge (even the pkt_drop node). The control plane feeds the next_edge in
the 8B next_hop, which I defined in the other patch and also below
(rte_ip4_lookup_fib_next_hop_t).


> +       /* Drop node */
> +       drop_nh = ((uint32_t)RTE_NODE_IP4_LOOKUP_NEXT_PKT_DROP) << 16;

Let the drop edge be determined from the next_hop returned by the lookup
(an rte_ip4_lookup_fib_next_hop_t). The control plane feeds the default
next_hop as part of setup_fib() as follows:

typedef union {
	struct {
		uint32_t next_hop_id;
		uint16_t next_edge;
		uint16_t reserved;
	};
	uint64_t u64;
} rte_ip4_lookup_fib_next_hop_t;

default_nh = { .next_edge = RTE_NODE_IP4_LOOKUP_NEXT_PKT_DROP }; which is
programmed in setup_fib().
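
For the control-plane side, a minimal sketch could look like this (it uses
the union above; apart from rte_fib_conf.default_nh and rte_fib_create(),
the sizes and names such as socket_id are illustrative only):

rte_ip4_lookup_fib_next_hop_t default_nh = {
	.next_hop_id = 0,
	.next_edge = RTE_NODE_IP4_LOOKUP_NEXT_PKT_DROP,
};

struct rte_fib_conf conf = {
	.type = RTE_FIB_DIR24_8,
	.default_nh = default_nh.u64,	/* returned on lookup miss */
	.max_routes = 1 << 16,		/* illustrative */
	.dir24_8 = {
		.nh_sz = RTE_FIB_DIR24_8_8B,
		.num_tbl8 = 1 << 15,	/* illustrative */
	},
};

fib = rte_fib_create("ip4_lookup_fib", socket_id, &conf);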
> +
> +       pkts = (struct rte_mbuf **)objs;
> +       from = objs;
> +       n_left_from = nb_objs;
> +
> +       /* Get stream for the speculated next node */
> +       to_next = rte_node_next_stream_get(graph, node, next_index, nb_objs);
> +
> +       for (i = OBJS_PER_CLINE; i < RTE_GRAPH_BURST_SIZE; i += OBJS_PER_CLINE)
> +               rte_prefetch0(&objs[i]);
> +
> +#if RTE_GRAPH_BURST_SIZE > 64
> +       for (i = 0; i < 4 && i < n_left_from; i++) {
> +               rte_prefetch0(pkts[i]);
> +               rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[i], void *,
> +                                       sizeof(struct rte_ether_hdr)));
> +       }
> +#endif
> +
> +       i = 0;
> +       while (n_left_from >= 4) {
> +#if RTE_GRAPH_BURST_SIZE > 64
> +               if (likely(n_left_from > 7)) {
> +                       rte_prefetch0(pkts[4]);
> +                       rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[4], void *,
> +                                       sizeof(struct rte_ether_hdr)));
> +                       rte_prefetch0(pkts[5]);
> +                       rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[5], void *,
> +                                       sizeof(struct rte_ether_hdr)));
> +                       rte_prefetch0(pkts[6]);
> +                       rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[6], void *,
> +                                       sizeof(struct rte_ether_hdr)));
> +                       rte_prefetch0(pkts[7]);
> +                       rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[7], void *,
> +                                       sizeof(struct rte_ether_hdr)));
> +               }
> +#endif
> +
> +               mbuf0 = pkts[0];
> +               mbuf1 = pkts[1];
> +               mbuf2 = pkts[2];
> +               mbuf3 = pkts[3];
> +               pkts += 4;
> +               n_left_from -= 4;
> +               /* Extract DIP of mbuf0 */
> +               ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf0, struct rte_ipv4_hdr *,
> +                               sizeof(struct rte_ether_hdr));
> +               /* Extract cksum, ttl as ipv4 hdr is in cache */
> +               node_mbuf_priv1(mbuf0, dyn)->cksum = ipv4_hdr->hdr_checksum;
> +               node_mbuf_priv1(mbuf0, dyn)->ttl = ipv4_hdr->time_to_live;
> +
> +               ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
> +
> +               /* Extract DIP of mbuf1 */
> +               ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf1, struct rte_ipv4_hdr *,
> +                               sizeof(struct rte_ether_hdr));
> +               /* Extract cksum, ttl as ipv4 hdr is in cache */
> +               node_mbuf_priv1(mbuf1, dyn)->cksum = ipv4_hdr->hdr_checksum;
> +               node_mbuf_priv1(mbuf1, dyn)->ttl = ipv4_hdr->time_to_live;
> +
> +               ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
> +
> +               /* Extract DIP of mbuf2 */
> +               ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf2, struct rte_ipv4_hdr *,
> +                               sizeof(struct rte_ether_hdr));
> +               /* Extract cksum, ttl as ipv4 hdr is in cache */
> +               node_mbuf_priv1(mbuf2, dyn)->cksum = ipv4_hdr->hdr_checksum;
> +               node_mbuf_priv1(mbuf2, dyn)->ttl = ipv4_hdr->time_to_live;
> +
> +               ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
> +
> +               /* Extract DIP of mbuf3 */
> +               ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf3, struct rte_ipv4_hdr *,
> +                               sizeof(struct rte_ether_hdr));
> +
> +               /* Extract cksum, ttl as ipv4 hdr is in cache */
> +               node_mbuf_priv1(mbuf3, dyn)->cksum = ipv4_hdr->hdr_checksum;
> +               node_mbuf_priv1(mbuf3, dyn)->ttl = ipv4_hdr->time_to_live;
> +
> +               ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
> +       }
> +       while (n_left_from > 0) {
> +               mbuf0 = pkts[0];
> +               pkts += 1;
> +               n_left_from -= 1;
> +
> +               /* Extract DIP of mbuf0 */
> +               ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf0, struct rte_ipv4_hdr *,
> +                               sizeof(struct rte_ether_hdr));
> +               /* Extract cksum, ttl as ipv4 hdr is in cache */
> +               node_mbuf_priv1(mbuf0, dyn)->cksum = ipv4_hdr->hdr_checksum;
> +               node_mbuf_priv1(mbuf0, dyn)->ttl = ipv4_hdr->time_to_live;
> +
> +               ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
> +       }
> +
> +       rc = rte_fib_lookup_bulk(fib, ip, next_hop, nb_objs);
> +       if (unlikely(rc != 0))
> +               return 0;
> +
> +       for (i = 0; i < nb_objs; i++) {
> +               if (unlikely(next_hop[i] == FIB_DEFAULT_NH)) {

This check can be removed since we have programmed the drop node in default_nh.

> +                       next_hop[i] = drop_nh;
> +                       lookup_err += 1;
> +               }
> +
> +               mbuf0 = (struct rte_mbuf *)objs[i];
> +               node_mbuf_priv1(mbuf0, dyn)->nh = (uint16_t)next_hop[i];
> +               next = (uint16_t)(next_hop[i] >> 16);

Please get both next and nh from next_hop itself:

      node_mbuf_priv1(mbuf0, dyn)->nh =
              ((rte_ip4_lookup_fib_next_hop_t *)&next_hop[i])->next_hop_id;
      next = ((rte_ip4_lookup_fib_next_hop_t *)&next_hop[i])->next_edge;
> +
> +               if (unlikely(next_index ^ next)) {
> +                       /* Copy things successfully speculated till now */
> +                       rte_memcpy(to_next, from, last_spec * sizeof(from[0]));
> +                       from += last_spec;
> +                       to_next += last_spec;
> +                       held += last_spec;
> +                       last_spec = 0;
> +
> +                       rte_node_enqueue_x1(graph, node, next, from[0]);
> +                       from += 1;
> +               } else {
> +                       last_spec += 1;
> +               }
> +       }
> +
> +       /* !!! Home run !!! */
> +       if (likely(last_spec == nb_objs)) {
> +               rte_node_next_stream_move(graph, node, next_index);
> +               return nb_objs;
> +       }
> +
> +       NODE_INCREMENT_XSTAT_ID(node, 0, lookup_err != 0, lookup_err);
> +       held += last_spec;
> +       rte_memcpy(to_next, from, last_spec * sizeof(from[0]));

Save the current next_index in node->ctx to make speculation work in the
next iteration.
> +       rte_node_next_stream_put(graph, node, next_index, held);
> +
> +       return nb_objs;
> +}
> +
>  RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_node_ip4_fib_route_add, 25.07)
>  int
>  rte_node_ip4_fib_route_add(uint32_t ip, uint8_t depth, uint16_t next_hop,
> @@ -147,6 +310,7 @@ static struct rte_node_xstats ip4_lookup_fib_xstats = {
>  };
>
>  static struct rte_node_register ip4_lookup_fib_node = {
> +       .process = ip4_lookup_fib_node_process,
>         .name = "ip4_lookup_fib",
>
>         .init = ip4_lookup_fib_node_init,
> --
> 2.25.1
>
  
Medvedkin, Vladimir April 16, 2025, 12:54 p.m. UTC | #2
Hi Ankur,

On 15/04/2025 13:10, Ankur Dwivedi wrote:
> Adds the process callback function for ip4_lookup_fib node.
>
> Signed-off-by: Ankur Dwivedi <adwivedi@marvell.com>
> ---
>   lib/node/ip4_lookup_fib.c | 164 ++++++++++++++++++++++++++++++++++++++
>   1 file changed, 164 insertions(+)
>
> diff --git a/lib/node/ip4_lookup_fib.c b/lib/node/ip4_lookup_fib.c
> index e87864e672..c535b191f8 100644
> --- a/lib/node/ip4_lookup_fib.c
> +++ b/lib/node/ip4_lookup_fib.c
> @@ -40,6 +40,169 @@ static struct ip4_lookup_fib_node_main ip4_lookup_fib_nm;
>   #define IP4_LOOKUP_NODE_PRIV1_OFF(ctx) \
>   	(((struct ip4_lookup_fib_node_ctx *)ctx)->mbuf_priv1_off)
>   
> +static uint16_t
> +ip4_lookup_fib_node_process(struct rte_graph *graph, struct rte_node *node, void **objs,
> +			    uint16_t nb_objs)
> +{
> +	struct rte_mbuf *mbuf0, *mbuf1, *mbuf2, *mbuf3, **pkts;
> +	struct rte_fib *fib = IP4_LOOKUP_NODE_FIB(node->ctx);
> +	const int dyn = IP4_LOOKUP_NODE_PRIV1_OFF(node->ctx);
> +	struct rte_ipv4_hdr *ipv4_hdr;
> +	uint64_t next_hop[nb_objs];
> +	uint16_t lookup_err = 0;
> +	void **to_next, **from;
> +	uint16_t last_spec = 0;
> +	rte_edge_t next_index;
> +	uint16_t n_left_from;
> +	uint32_t ip[nb_objs];
> +	uint16_t held = 0;
> +	uint32_t drop_nh;
> +	uint16_t next;
> +	int i, rc;
> +
> +	/* Speculative next */
> +	next_index = RTE_NODE_IP4_LOOKUP_NEXT_REWRITE;
> +	/* Drop node */
> +	drop_nh = ((uint32_t)RTE_NODE_IP4_LOOKUP_NEXT_PKT_DROP) << 16;
> +
> +	pkts = (struct rte_mbuf **)objs;
> +	from = objs;
> +	n_left_from = nb_objs;
> +
> +	/* Get stream for the speculated next node */
> +	to_next = rte_node_next_stream_get(graph, node, next_index, nb_objs);
> +
> +	for (i = OBJS_PER_CLINE; i < RTE_GRAPH_BURST_SIZE; i += OBJS_PER_CLINE)
> +		rte_prefetch0(&objs[i]);

Does this prefetching loop make any sense? Unless objs are passed
across threads, this array is likely to be in the cache already.

And if objs are passed across threads, then why do you start prefetching
from the second cache line instead of the first, and why don't you stop at
nb_objs?

> +
> +#if RTE_GRAPH_BURST_SIZE > 64
> +	for (i = 0; i < 4 && i < n_left_from; i++) {
> +		rte_prefetch0(pkts[i]);
> +		rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[i], void *,
> +					sizeof(struct rte_ether_hdr)));

This construction does not make sense to me. The same applies to the
similar constructions below.

The second prefetch has a memory dependency on the data that will be
prefetched by the first one. Does removing this prefetch affect
performance?

> +	}
> +#endif
> +
> +	i = 0;
> +	while (n_left_from >= 4) {
> +#if RTE_GRAPH_BURST_SIZE > 64
> +		if (likely(n_left_from > 7)) {
> +			rte_prefetch0(pkts[4]);
> +			rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[4], void *,
> +					sizeof(struct rte_ether_hdr)));
> +			rte_prefetch0(pkts[5]);
> +			rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[5], void *,
> +					sizeof(struct rte_ether_hdr)));
> +			rte_prefetch0(pkts[6]);
> +			rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[6], void *,
> +					sizeof(struct rte_ether_hdr)));
> +			rte_prefetch0(pkts[7]);
> +			rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[7], void *,
> +					sizeof(struct rte_ether_hdr)));
> +		}
> +#endif
> +
> +		mbuf0 = pkts[0];
> +		mbuf1 = pkts[1];
> +		mbuf2 = pkts[2];
> +		mbuf3 = pkts[3];
> +		pkts += 4;
> +		n_left_from -= 4;
> +		/* Extract DIP of mbuf0 */
> +		ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf0, struct rte_ipv4_hdr *,
> +				sizeof(struct rte_ether_hdr));
> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
> +		node_mbuf_priv1(mbuf0, dyn)->cksum = ipv4_hdr->hdr_checksum;
> +		node_mbuf_priv1(mbuf0, dyn)->ttl = ipv4_hdr->time_to_live;
> +
> +		ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
> +
> +		/* Extract DIP of mbuf1 */
> +		ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf1, struct rte_ipv4_hdr *,
> +				sizeof(struct rte_ether_hdr));
> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
> +		node_mbuf_priv1(mbuf1, dyn)->cksum = ipv4_hdr->hdr_checksum;
> +		node_mbuf_priv1(mbuf1, dyn)->ttl = ipv4_hdr->time_to_live;
> +
> +		ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
> +
> +		/* Extract DIP of mbuf2 */
> +		ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf2, struct rte_ipv4_hdr *,
> +				sizeof(struct rte_ether_hdr));
> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
> +		node_mbuf_priv1(mbuf2, dyn)->cksum = ipv4_hdr->hdr_checksum;
> +		node_mbuf_priv1(mbuf2, dyn)->ttl = ipv4_hdr->time_to_live;
> +
> +		ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
> +
> +		/* Extract DIP of mbuf3 */
> +		ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf3, struct rte_ipv4_hdr *,
> +				sizeof(struct rte_ether_hdr));
> +
> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
> +		node_mbuf_priv1(mbuf3, dyn)->cksum = ipv4_hdr->hdr_checksum;
> +		node_mbuf_priv1(mbuf3, dyn)->ttl = ipv4_hdr->time_to_live;
> +
> +		ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
> +	}
> +	while (n_left_from > 0) {
> +		mbuf0 = pkts[0];
> +		pkts += 1;
> +		n_left_from -= 1;
> +
> +		/* Extract DIP of mbuf0 */
> +		ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf0, struct rte_ipv4_hdr *,
> +				sizeof(struct rte_ether_hdr));
> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
> +		node_mbuf_priv1(mbuf0, dyn)->cksum = ipv4_hdr->hdr_checksum;
> +		node_mbuf_priv1(mbuf0, dyn)->ttl = ipv4_hdr->time_to_live;
> +
> +		ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
> +	}
> +
> +	rc = rte_fib_lookup_bulk(fib, ip, next_hop, nb_objs);
> +	if (unlikely(rc != 0))
> +		return 0;
> +
> +	for (i = 0; i < nb_objs; i++) {
> +		if (unlikely(next_hop[i] == FIB_DEFAULT_NH)) {
> +			next_hop[i] = drop_nh;
Maybe it is worth just defining FIB_DEFAULT_NH as the drop_nh value to
omit these next_hop reassignments?
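Roughly (assuming setup_fib() programs this same macro as the FIB's
rte_fib_conf.default_nh, so a lookup miss already carries the drop edge):

#define FIB_DEFAULT_NH \
	(((uint32_t)RTE_NODE_IP4_LOOKUP_NEXT_PKT_DROP) << 16)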
> +			lookup_err += 1;
> +		}
> +
> +		mbuf0 = (struct rte_mbuf *)objs[i];
> +		node_mbuf_priv1(mbuf0, dyn)->nh = (uint16_t)next_hop[i];
> +		next = (uint16_t)(next_hop[i] >> 16);
> +
> +		if (unlikely(next_index ^ next)) {
> +			/* Copy things successfully speculated till now */
> +			rte_memcpy(to_next, from, last_spec * sizeof(from[0]));
> +			from += last_spec;
> +			to_next += last_spec;
> +			held += last_spec;
> +			last_spec = 0;
> +
> +			rte_node_enqueue_x1(graph, node, next, from[0]);
> +			from += 1;
> +		} else {
> +			last_spec += 1;
> +		}
> +	}
> +
> +	/* !!! Home run !!! */
> +	if (likely(last_spec == nb_objs)) {
> +		rte_node_next_stream_move(graph, node, next_index);
> +		return nb_objs;
> +	}
> +
> +	NODE_INCREMENT_XSTAT_ID(node, 0, lookup_err != 0, lookup_err);
> +	held += last_spec;
> +	rte_memcpy(to_next, from, last_spec * sizeof(from[0]));
> +	rte_node_next_stream_put(graph, node, next_index, held);
> +
> +	return nb_objs;
> +}
> +
>   RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_node_ip4_fib_route_add, 25.07)
>   int
>   rte_node_ip4_fib_route_add(uint32_t ip, uint8_t depth, uint16_t next_hop,
> @@ -147,6 +310,7 @@ static struct rte_node_xstats ip4_lookup_fib_xstats = {
>   };
>   
>   static struct rte_node_register ip4_lookup_fib_node = {
> +	.process = ip4_lookup_fib_node_process,
>   	.name = "ip4_lookup_fib",
>   
>   	.init = ip4_lookup_fib_node_init,
  
Ankur Dwivedi April 18, 2025, 7:38 a.m. UTC | #3
Hi Vladimir,
>> diff --git a/lib/node/ip4_lookup_fib.c b/lib/node/ip4_lookup_fib.c
>> index e87864e672..c535b191f8 100644
>> --- a/lib/node/ip4_lookup_fib.c
>> +++ b/lib/node/ip4_lookup_fib.c
>> @@ -40,6 +40,169 @@ static struct ip4_lookup_fib_node_main
>ip4_lookup_fib_nm;
>>   #define IP4_LOOKUP_NODE_PRIV1_OFF(ctx) \
>>   	(((struct ip4_lookup_fib_node_ctx *)ctx)->mbuf_priv1_off)
>>
>> +static uint16_t
>> +ip4_lookup_fib_node_process(struct rte_graph *graph, struct rte_node
>*node, void **objs,
>> +			    uint16_t nb_objs)
>> +{
>> +	struct rte_mbuf *mbuf0, *mbuf1, *mbuf2, *mbuf3, **pkts;
>> +	struct rte_fib *fib = IP4_LOOKUP_NODE_FIB(node->ctx);
>> +	const int dyn = IP4_LOOKUP_NODE_PRIV1_OFF(node->ctx);
>> +	struct rte_ipv4_hdr *ipv4_hdr;
>> +	uint64_t next_hop[nb_objs];
>> +	uint16_t lookup_err = 0;
>> +	void **to_next, **from;
>> +	uint16_t last_spec = 0;
>> +	rte_edge_t next_index;
>> +	uint16_t n_left_from;
>> +	uint32_t ip[nb_objs];
>> +	uint16_t held = 0;
>> +	uint32_t drop_nh;
>> +	uint16_t next;
>> +	int i, rc;
>> +
>> +	/* Speculative next */
>> +	next_index = RTE_NODE_IP4_LOOKUP_NEXT_REWRITE;
>> +	/* Drop node */
>> +	drop_nh = ((uint32_t)RTE_NODE_IP4_LOOKUP_NEXT_PKT_DROP) <<
>16;
>> +
>> +	pkts = (struct rte_mbuf **)objs;
>> +	from = objs;
>> +	n_left_from = nb_objs;
>> +
>> +	/* Get stream for the speculated next node */
>> +	to_next = rte_node_next_stream_get(graph, node, next_index,
>> +nb_objs);
>> +
>> +	for (i = OBJS_PER_CLINE; i < RTE_GRAPH_BURST_SIZE; i +=
>OBJS_PER_CLINE)
>> +		rte_prefetch0(&objs[i]);
>
>Does this prefetching loop make any sense? Unless objs are not passed across
>threads this array likely to be in the cache already.
>
>And if objs are passed across threads, then why do you start prefetching from
>the next cache line instead of the first, and why don't you stop at nb_objs?
For example, if the cache line size is 64 bytes, there will be 8 mbuf pointers per cache line (with 8-byte pointers). If nb_objs is 256, the remaining pointers need to be prefetched into the cache; this loop prefetches all nb_objs pointers.
The loop can be changed to stop at nb_objs:
for (i = OBJS_PER_CLINE; i < nb_objs; i += OBJS_PER_CLINE) { }

>
>> +
>> +#if RTE_GRAPH_BURST_SIZE > 64
>> +	for (i = 0; i < 4 && i < n_left_from; i++) {
>> +		rte_prefetch0(pkts[i]);
>> +		rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[i], void *,
>> +					sizeof(struct rte_ether_hdr)));
>
>This construction does not make sense to me. Same as similar constructions
>below
>
>The second prefetch has memory dependency of the data, that will be
>prefetched by the first one. Does removing this prefetch affects performance?
The first prefetches the data at the mbuf address. The second prefetch is for the address containing the IPv4 header. Both can be in separate cache lines, as the IPv4 header starts at mbuf->buf_addr + mbuf->data_off + sizeof(struct rte_ether_hdr); data_off is generally 64 or 128 bytes depending on the cache line size.
>
>> +	}
>> +#endif
>> +
>> +	i = 0;
>> +	while (n_left_from >= 4) {
>> +#if RTE_GRAPH_BURST_SIZE > 64
>> +		if (likely(n_left_from > 7)) {
>> +			rte_prefetch0(pkts[4]);
>> +			rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[4], void
>*,
>> +					sizeof(struct rte_ether_hdr)));
>> +			rte_prefetch0(pkts[5]);
>> +			rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[5], void
>*,
>> +					sizeof(struct rte_ether_hdr)));
>> +			rte_prefetch0(pkts[6]);
>> +			rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[6], void
>*,
>> +					sizeof(struct rte_ether_hdr)));
>> +			rte_prefetch0(pkts[7]);
>> +			rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[7], void
>*,
>> +					sizeof(struct rte_ether_hdr)));
>> +		}
>> +#endif
>> +
>> +		mbuf0 = pkts[0];
>> +		mbuf1 = pkts[1];
>> +		mbuf2 = pkts[2];
>> +		mbuf3 = pkts[3];
>> +		pkts += 4;
>> +		n_left_from -= 4;
>> +		/* Extract DIP of mbuf0 */
>> +		ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf0, struct
>rte_ipv4_hdr *,
>> +				sizeof(struct rte_ether_hdr));
>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>> +		node_mbuf_priv1(mbuf0, dyn)->cksum = ipv4_hdr-
>>hdr_checksum;
>> +		node_mbuf_priv1(mbuf0, dyn)->ttl = ipv4_hdr->time_to_live;
>> +
>> +		ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
>> +
>> +		/* Extract DIP of mbuf1 */
>> +		ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf1, struct
>rte_ipv4_hdr *,
>> +				sizeof(struct rte_ether_hdr));
>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>> +		node_mbuf_priv1(mbuf1, dyn)->cksum = ipv4_hdr-
>>hdr_checksum;
>> +		node_mbuf_priv1(mbuf1, dyn)->ttl = ipv4_hdr->time_to_live;
>> +
>> +		ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
>> +
>> +		/* Extract DIP of mbuf2 */
>> +		ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf2, struct
>rte_ipv4_hdr *,
>> +				sizeof(struct rte_ether_hdr));
>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>> +		node_mbuf_priv1(mbuf2, dyn)->cksum = ipv4_hdr-
>>hdr_checksum;
>> +		node_mbuf_priv1(mbuf2, dyn)->ttl = ipv4_hdr->time_to_live;
>> +
>> +		ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
>> +
>> +		/* Extract DIP of mbuf3 */
>> +		ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf3, struct
>rte_ipv4_hdr *,
>> +				sizeof(struct rte_ether_hdr));
>> +
>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>> +		node_mbuf_priv1(mbuf3, dyn)->cksum = ipv4_hdr-
>>hdr_checksum;
>> +		node_mbuf_priv1(mbuf3, dyn)->ttl = ipv4_hdr->time_to_live;
>> +
>> +		ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
>> +	}
>> +	while (n_left_from > 0) {
>> +		mbuf0 = pkts[0];
>> +		pkts += 1;
>> +		n_left_from -= 1;
>> +
>> +		/* Extract DIP of mbuf0 */
>> +		ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf0, struct
>rte_ipv4_hdr *,
>> +				sizeof(struct rte_ether_hdr));
>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>> +		node_mbuf_priv1(mbuf0, dyn)->cksum = ipv4_hdr-
>>hdr_checksum;
>> +		node_mbuf_priv1(mbuf0, dyn)->ttl = ipv4_hdr->time_to_live;
>> +
>> +		ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
>> +	}
>> +
>> +	rc = rte_fib_lookup_bulk(fib, ip, next_hop, nb_objs);
>> +	if (unlikely(rc != 0))
>> +		return 0;
>> +
>> +	for (i = 0; i < nb_objs; i++) {
>> +		if (unlikely(next_hop[i] == FIB_DEFAULT_NH)) {
>> +			next_hop[i] = drop_nh;
>maybe it is worth just do define FIB_DEFAULT_NH as a drop_nh vaue to omit
>these next_hop reassignments?
Ack.
>> +			lookup_err += 1;
>> +		}
>> +
>> +		mbuf0 = (struct rte_mbuf *)objs[i];
>> +		node_mbuf_priv1(mbuf0, dyn)->nh = (uint16_t)next_hop[i];
>> +		next = (uint16_t)(next_hop[i] >> 16);
>> +
>> +		if (unlikely(next_index ^ next)) {
>> +			/* Copy things successfully speculated till now */
>> +			rte_memcpy(to_next, from, last_spec *
>sizeof(from[0]));
>> +			from += last_spec;
>> +			to_next += last_spec;
>> +			held += last_spec;
>> +			last_spec = 0;
>> +
>> +			rte_node_enqueue_x1(graph, node, next, from[0]);
>> +			from += 1;
>> +		} else {
>> +			last_spec += 1;
>> +		}
>> +	}
>> +
>> +	/* !!! Home run !!! */
>> +	if (likely(last_spec == nb_objs)) {
>> +		rte_node_next_stream_move(graph, node, next_index);
>> +		return nb_objs;
>> +	}
>> +
>> +	NODE_INCREMENT_XSTAT_ID(node, 0, lookup_err != 0, lookup_err);
>> +	held += last_spec;
>> +	rte_memcpy(to_next, from, last_spec * sizeof(from[0]));
>> +	rte_node_next_stream_put(graph, node, next_index, held);
>> +
>> +	return nb_objs;
>> +}
>> +
>>   RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_node_ip4_fib_route_add, 25.07)
>>   int
>>   rte_node_ip4_fib_route_add(uint32_t ip, uint8_t depth, uint16_t
>> next_hop, @@ -147,6 +310,7 @@ static struct rte_node_xstats
>ip4_lookup_fib_xstats = {
>>   };
>>
>>   static struct rte_node_register ip4_lookup_fib_node = {
>> +	.process = ip4_lookup_fib_node_process,
>>   	.name = "ip4_lookup_fib",
>>
>>   	.init = ip4_lookup_fib_node_init,
>
>--
>Regards,
>Vladimir
  
Vladimir Medvedkin April 19, 2025, 6:14 p.m. UTC | #4
Hi Ankur,

Fri, 18 Apr 2025 at 15:45, Ankur Dwivedi <adwivedi@marvell.com>:

>
> Hi Vladimir,
> >> diff --git a/lib/node/ip4_lookup_fib.c b/lib/node/ip4_lookup_fib.c
> >> index e87864e672..c535b191f8 100644
> >> --- a/lib/node/ip4_lookup_fib.c
> >> +++ b/lib/node/ip4_lookup_fib.c
> >> @@ -40,6 +40,169 @@ static struct ip4_lookup_fib_node_main
> >ip4_lookup_fib_nm;
> >>   #define IP4_LOOKUP_NODE_PRIV1_OFF(ctx) \
> >>      (((struct ip4_lookup_fib_node_ctx *)ctx)->mbuf_priv1_off)
> >>
> >> +static uint16_t
> >> +ip4_lookup_fib_node_process(struct rte_graph *graph, struct rte_node
> >*node, void **objs,
> >> +                        uint16_t nb_objs)
> >> +{
> >> +    struct rte_mbuf *mbuf0, *mbuf1, *mbuf2, *mbuf3, **pkts;
> >> +    struct rte_fib *fib = IP4_LOOKUP_NODE_FIB(node->ctx);
> >> +    const int dyn = IP4_LOOKUP_NODE_PRIV1_OFF(node->ctx);
> >> +    struct rte_ipv4_hdr *ipv4_hdr;
> >> +    uint64_t next_hop[nb_objs];
> >> +    uint16_t lookup_err = 0;
> >> +    void **to_next, **from;
> >> +    uint16_t last_spec = 0;
> >> +    rte_edge_t next_index;
> >> +    uint16_t n_left_from;
> >> +    uint32_t ip[nb_objs];
> >> +    uint16_t held = 0;
> >> +    uint32_t drop_nh;
> >> +    uint16_t next;
> >> +    int i, rc;
> >> +
> >> +    /* Speculative next */
> >> +    next_index = RTE_NODE_IP4_LOOKUP_NEXT_REWRITE;
> >> +    /* Drop node */
> >> +    drop_nh = ((uint32_t)RTE_NODE_IP4_LOOKUP_NEXT_PKT_DROP) <<
> >16;
> >> +
> >> +    pkts = (struct rte_mbuf **)objs;
> >> +    from = objs;
> >> +    n_left_from = nb_objs;
> >> +
> >> +    /* Get stream for the speculated next node */
> >> +    to_next = rte_node_next_stream_get(graph, node, next_index,
> >> +nb_objs);
> >> +
> >> +    for (i = OBJS_PER_CLINE; i < RTE_GRAPH_BURST_SIZE; i +=
> >OBJS_PER_CLINE)
> >> +            rte_prefetch0(&objs[i]);
> >
> >Does this prefetching loop make any sense? Unless objs are not passed
> across
> >threads this array likely to be in the cache already.
> >
> >And if objs are passed across threads, then why do you start prefetching
> from
> >the next cache line instead of the first, and why don't you stop at
> nb_objs?
> For example if cache size is 64 bytes. Then there will be 8 pointers to
> mbuf per cache line (if address size is 8 bytes). If nb_objs is 256, then
> the remaining pointers needs to be prefetched to cache. This loop helps to
> prefetch nb_objs pointers to cache.
>

My comment was about necessity. I'll rephrase and enumerate my questions to
be clearer:
1. Are the mbuf pointers contained in the objs array not in the cache? My
assumptions here are:
    - If you run the graph in RTC mode, the objs array is very likely to
already be in your L1 cache (since some previous node/nodes just put packets
there)
    - In dispatch mode it doesn't make much sense to run this lookup node in
another thread separately from the remaining nodes processing IPv4

2. If you still think this prefetching loop is required here, then:

> The loop can be changed to stop at nb_objs.
> for (i = OBJS_PER_CLINE; i < nb_objs; i += OBJS_PER_CLINE) { }
>
>
  why do you start from OBJS_PER_CLINE (the second cache line of the objs
array)?

3. "i < nb_objs". Should be "i < RTE_ALIGN_CEIL(nb_objs, OBJS_PER_CLINE) /
OBJS_PER_CLINE"

>
> >> +
> >> +#if RTE_GRAPH_BURST_SIZE > 64
> >> +    for (i = 0; i < 4 && i < n_left_from; i++) {
> >> +            rte_prefetch0(pkts[i]);
> >> +            rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[i], void *,
> >> +                                    sizeof(struct rte_ether_hdr)));
> >
> >This construction does not make sense to me. Same as similar constructions
> >below
> >
> >The second prefetch has memory dependency of the data, that will be
> >prefetched by the first one. Does removing this prefetch affects
> performance?
> The first prefetches the data at mbuf address. The second prefetch is for
> the address containing ipv4 header. Both can be in separate cache lines, as
> ipv4 starts at mbuf->buf_addr + mbuf->data_off + sizeof(ethernet header).
> data_off is generally 64 bytes or 128 bytes depending on cache line size.
> >
>

Indeed, both of them are in separate cache lines. But my point here is
about the data dependency of the second prefetch. In order to issue the
prefetch instruction for the cache line containing the IPv4 header you must
know the address. You calculate this address with:
rte_pktmbuf_mtod_offset(pkts[i], void *, sizeof(struct rte_ether_hdr))
rte_pktmbuf_mtod_offset accesses mbuf->buf_addr and mbuf->data_off, as
you mentioned. But these fields are not in your L1 cache at the moment,
because you just asked to prefetch them with the previous instruction.
So my suggestion here would be to look at how it is done in
ipv4_lookup_sse.c:ip4_lookup_node_process_vec().
Simplifying, in the run loop it prefetches the mbuf of pkt i + 2, then
prefetches the cache line with the IPv4 header of pkt i + 1 (assuming its
mbuf cache line was already fetched into L1 in a previous iteration), and
finally processes pkt i + 0 (again, assuming its IPv4 header cache line was
fetched in the previous iteration).
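
Roughly, the software-pipelined shape is the following (a simplified sketch,
not the exact ip4_lookup_sse.c code):

/* Prologue: request the mbuf cache lines of the first packets, and the
 * header cache line of packet 0 (this one pays the dependency once). */
rte_prefetch0(pkts[0]);
rte_prefetch0(pkts[1]);
rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[0], void *,
			sizeof(struct rte_ether_hdr)));

for (i = 0; i < nb_objs; i++) {
	/* mbuf cache line of pkt i + 2 (needed to compute its header
	 * address in the next iteration) */
	if (likely(i + 2 < nb_objs))
		rte_prefetch0(pkts[i + 2]);
	/* header cache line of pkt i + 1; its mbuf line was requested
	 * one iteration ago, so buf_addr/data_off should be in L1 */
	if (likely(i + 1 < nb_objs))
		rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[i + 1], void *,
					sizeof(struct rte_ether_hdr)));
	/* process pkt i; both of its cache lines were requested earlier */
	ipv4_hdr = rte_pktmbuf_mtod_offset(pkts[i], struct rte_ipv4_hdr *,
			sizeof(struct rte_ether_hdr));
	ip[i] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
}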


> >> +    }
> >> +#endif
> >> +
> >> +    i = 0;
> >> +    while (n_left_from >= 4) {
> >> +#if RTE_GRAPH_BURST_SIZE > 64
> >> +            if (likely(n_left_from > 7)) {
> >> +                    rte_prefetch0(pkts[4]);
> >> +                    rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[4], void
> >*,
> >> +                                    sizeof(struct rte_ether_hdr)));
> >> +                    rte_prefetch0(pkts[5]);
> >> +                    rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[5], void
> >*,
> >> +                                    sizeof(struct rte_ether_hdr)));
> >> +                    rte_prefetch0(pkts[6]);
> >> +                    rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[6], void
> >*,
> >> +                                    sizeof(struct rte_ether_hdr)));
> >> +                    rte_prefetch0(pkts[7]);
> >> +                    rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[7], void
> >*,
> >> +                                    sizeof(struct rte_ether_hdr)));
> >> +            }
> >> +#endif
> >> +
> >> +            mbuf0 = pkts[0];
> >> +            mbuf1 = pkts[1];
> >> +            mbuf2 = pkts[2];
> >> +            mbuf3 = pkts[3];
> >> +            pkts += 4;
> >> +            n_left_from -= 4;
> >> +            /* Extract DIP of mbuf0 */
> >> +            ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf0, struct
> >rte_ipv4_hdr *,
> >> +                            sizeof(struct rte_ether_hdr));
> >> +            /* Extract cksum, ttl as ipv4 hdr is in cache */
> >> +            node_mbuf_priv1(mbuf0, dyn)->cksum = ipv4_hdr-
> >>hdr_checksum;
> >> +            node_mbuf_priv1(mbuf0, dyn)->ttl = ipv4_hdr->time_to_live;
> >> +
> >> +            ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
> >> +
> >> +            /* Extract DIP of mbuf1 */
> >> +            ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf1, struct
> >rte_ipv4_hdr *,
> >> +                            sizeof(struct rte_ether_hdr));
> >> +            /* Extract cksum, ttl as ipv4 hdr is in cache */
> >> +            node_mbuf_priv1(mbuf1, dyn)->cksum = ipv4_hdr-
> >>hdr_checksum;
> >> +            node_mbuf_priv1(mbuf1, dyn)->ttl = ipv4_hdr->time_to_live;
> >> +
> >> +            ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
> >> +
> >> +            /* Extract DIP of mbuf2 */
> >> +            ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf2, struct
> >rte_ipv4_hdr *,
> >> +                            sizeof(struct rte_ether_hdr));
> >> +            /* Extract cksum, ttl as ipv4 hdr is in cache */
> >> +            node_mbuf_priv1(mbuf2, dyn)->cksum = ipv4_hdr-
> >>hdr_checksum;
> >> +            node_mbuf_priv1(mbuf2, dyn)->ttl = ipv4_hdr->time_to_live;
> >> +
> >> +            ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
> >> +
> >> +            /* Extract DIP of mbuf3 */
> >> +            ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf3, struct
> >rte_ipv4_hdr *,
> >> +                            sizeof(struct rte_ether_hdr));
> >> +
> >> +            /* Extract cksum, ttl as ipv4 hdr is in cache */
> >> +            node_mbuf_priv1(mbuf3, dyn)->cksum = ipv4_hdr-
> >>hdr_checksum;
> >> +            node_mbuf_priv1(mbuf3, dyn)->ttl = ipv4_hdr->time_to_live;
> >> +
> >> +            ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
> >> +    }
> >> +    while (n_left_from > 0) {
> >> +            mbuf0 = pkts[0];
> >> +            pkts += 1;
> >> +            n_left_from -= 1;
> >> +
> >> +            /* Extract DIP of mbuf0 */
> >> +            ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf0, struct
> >rte_ipv4_hdr *,
> >> +                            sizeof(struct rte_ether_hdr));
> >> +            /* Extract cksum, ttl as ipv4 hdr is in cache */
> >> +            node_mbuf_priv1(mbuf0, dyn)->cksum = ipv4_hdr-
> >>hdr_checksum;
> >> +            node_mbuf_priv1(mbuf0, dyn)->ttl = ipv4_hdr->time_to_live;
> >> +
> >> +            ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
> >> +    }
> >> +
> >> +    rc = rte_fib_lookup_bulk(fib, ip, next_hop, nb_objs);
> >> +    if (unlikely(rc != 0))
> >> +            return 0;
> >> +
> >> +    for (i = 0; i < nb_objs; i++) {
> >> +            if (unlikely(next_hop[i] == FIB_DEFAULT_NH)) {
> >> +                    next_hop[i] = drop_nh;
> >maybe it is worth just do define FIB_DEFAULT_NH as a drop_nh vaue to omit
> >these next_hop reassignments?
> Ack.
> >> +                    lookup_err += 1;
> >> +            }
> >> +
> >> +            mbuf0 = (struct rte_mbuf *)objs[i];
> >> +            node_mbuf_priv1(mbuf0, dyn)->nh = (uint16_t)next_hop[i];
> >> +            next = (uint16_t)(next_hop[i] >> 16);
> >> +
> >> +            if (unlikely(next_index ^ next)) {
> >> +                    /* Copy things successfully speculated till now */
> >> +                    rte_memcpy(to_next, from, last_spec *
> >sizeof(from[0]));
> >> +                    from += last_spec;
> >> +                    to_next += last_spec;
> >> +                    held += last_spec;
> >> +                    last_spec = 0;
> >> +
> >> +                    rte_node_enqueue_x1(graph, node, next, from[0]);
> >> +                    from += 1;
> >> +            } else {
> >> +                    last_spec += 1;
> >> +            }
> >> +    }
> >> +
> >> +    /* !!! Home run !!! */
> >> +    if (likely(last_spec == nb_objs)) {
> >> +            rte_node_next_stream_move(graph, node, next_index);
> >> +            return nb_objs;
> >> +    }
> >> +
> >> +    NODE_INCREMENT_XSTAT_ID(node, 0, lookup_err != 0, lookup_err);
> >> +    held += last_spec;
> >> +    rte_memcpy(to_next, from, last_spec * sizeof(from[0]));
> >> +    rte_node_next_stream_put(graph, node, next_index, held);
> >> +
> >> +    return nb_objs;
> >> +}
> >> +
> >>   RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_node_ip4_fib_route_add, 25.07)
> >>   int
> >>   rte_node_ip4_fib_route_add(uint32_t ip, uint8_t depth, uint16_t
> >> next_hop, @@ -147,6 +310,7 @@ static struct rte_node_xstats
> >ip4_lookup_fib_xstats = {
> >>   };
> >>
> >>   static struct rte_node_register ip4_lookup_fib_node = {
> >> +    .process = ip4_lookup_fib_node_process,
> >>      .name = "ip4_lookup_fib",
> >>
> >>      .init = ip4_lookup_fib_node_init,
> >
> >--
> >Regards,
> >Vladimir
>
>
  
Ankur Dwivedi April 23, 2025, 9:46 a.m. UTC | #5
Hi Vladimir,

>Hi Ankur,
>
>Fri, 18 Apr 2025 at 15:45, Ankur Dwivedi <adwivedi@marvell.com>:
>
>
>
>	Hi Vladimir,
>	>> diff --git a/lib/node/ip4_lookup_fib.c b/lib/node/ip4_lookup_fib.c
>	>> index e87864e672..c535b191f8 100644
>	>> --- a/lib/node/ip4_lookup_fib.c
>	>> +++ b/lib/node/ip4_lookup_fib.c
>	>> @@ -40,6 +40,169 @@ static struct ip4_lookup_fib_node_main
>	>ip4_lookup_fib_nm;
>	>>   #define IP4_LOOKUP_NODE_PRIV1_OFF(ctx) \
>	>>      (((struct ip4_lookup_fib_node_ctx *)ctx)->mbuf_priv1_off)
>	>>
>	>> +static uint16_t
>	>> +ip4_lookup_fib_node_process(struct rte_graph *graph, struct
>rte_node
>	>*node, void **objs,
>	>> +                        uint16_t nb_objs)
>	>> +{
>	>> +    struct rte_mbuf *mbuf0, *mbuf1, *mbuf2, *mbuf3, **pkts;
>	>> +    struct rte_fib *fib = IP4_LOOKUP_NODE_FIB(node->ctx);
>	>> +    const int dyn = IP4_LOOKUP_NODE_PRIV1_OFF(node->ctx);
>	>> +    struct rte_ipv4_hdr *ipv4_hdr;
>	>> +    uint64_t next_hop[nb_objs];
>	>> +    uint16_t lookup_err = 0;
>	>> +    void **to_next, **from;
>	>> +    uint16_t last_spec = 0;
>	>> +    rte_edge_t next_index;
>	>> +    uint16_t n_left_from;
>	>> +    uint32_t ip[nb_objs];
>	>> +    uint16_t held = 0;
>	>> +    uint32_t drop_nh;
>	>> +    uint16_t next;
>	>> +    int i, rc;
>	>> +
>	>> +    /* Speculative next */
>	>> +    next_index = RTE_NODE_IP4_LOOKUP_NEXT_REWRITE;
>	>> +    /* Drop node */
>	>> +    drop_nh =
>((uint32_t)RTE_NODE_IP4_LOOKUP_NEXT_PKT_DROP) <<
>	>16;
>	>> +
>	>> +    pkts = (struct rte_mbuf **)objs;
>	>> +    from = objs;
>	>> +    n_left_from = nb_objs;
>	>> +
>	>> +    /* Get stream for the speculated next node */
>	>> +    to_next = rte_node_next_stream_get(graph, node, next_index,
>	>> +nb_objs);
>	>> +
>	>> +    for (i = OBJS_PER_CLINE; i < RTE_GRAPH_BURST_SIZE; i +=
>	>OBJS_PER_CLINE)
>	>> +            rte_prefetch0(&objs[i]);
>	>
>	>Does this prefetching loop make any sense? Unless objs are not
>passed across
>	>threads this array likely to be in the cache already.
>	>
>	>And if objs are passed across threads, then why do you start
>prefetching from
>	>the next cache line instead of the first, and why don't you stop at
>nb_objs?
>	For example if cache size is 64 bytes. Then there will be 8 pointers to
>mbuf per cache line (if address size is 8 bytes). If nb_objs is 256, then the
>remaining pointers needs to be prefetched to cache. This loop helps to
>prefetch nb_objs pointers to cache.
>
>
>
>My comment was about necessity. I'll rephrase and enumerate my questions
>to be more clear:
>1. Are mbuf pointers contained in the objs array not in cache? My assumptions
>here are:
>    - If you run graph in RTC mode, objs array is very likely already be in your L1
>cache (since some previous node/nodes just put packets there)

Yes, the objs are likely to be in the L1 cache in RTC mode. But if they are not, the above loop prefetches them.
>    - In dispatch mode it doesn't make much sense to run this lookup node in an
>another thread separately from the remaining nodes processing IPv4

If it's run in another thread/lcore, the above prefetching will help.
>
>2. if you still thing this prefetching loop is required here, then:
>
>
>	The loop can be changed to stop at nb_objs.
>	for (i = OBJS_PER_CLINE; i < nb_objs; i += OBJS_PER_CLINE) { }
>
>
>
>  why you start from the OBJS_PER_CLINE (second cache line of the objs array)
Because the first line is prefetched in __rte_node_process(), which is called from rte_graph_walk().

>3. "i < nb_objs". Should be "i < RTE_ALIGN_CEIL(nb_objs, OBJS_PER_CLINE) /
>OBJS_PER_CLINE"

In that loop i would be incremented by 1, but the for loop I mentioned above does the same thing with i incremented by OBJS_PER_CLINE.
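
To make that concrete, both forms below touch the same cache lines of objs[]
(the first line is skipped because the graph walker has already fetched it);
only the loop counter differs. This is just an illustrative sketch:

/* i steps in units of pointers per cache line */
for (i = OBJS_PER_CLINE; i < nb_objs; i += OBJS_PER_CLINE)
	rte_prefetch0(&objs[i]);

/* equivalent form, with i counting cache lines (incremented by 1) */
const int n_lines = RTE_ALIGN_CEIL(nb_objs, OBJS_PER_CLINE) / OBJS_PER_CLINE;
for (i = 1; i < n_lines; i++)
	rte_prefetch0(&objs[i * OBJS_PER_CLINE]);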
>
>
>	>
>	>> +
>	>> +#if RTE_GRAPH_BURST_SIZE > 64
>	>> +    for (i = 0; i < 4 && i < n_left_from; i++) {
>	>> +            rte_prefetch0(pkts[i]);
>	>> +            rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[i], void *,
>	>> +                                    sizeof(struct rte_ether_hdr)));
>	>
>	>This construction does not make sense to me. Same as similar
>constructions
>	>below
>	>
>	>The second prefetch has memory dependency of the data, that will
>be
>	>prefetched by the first one. Does removing this prefetch affects
>performance?
>	The first prefetches the data at mbuf address. The second prefetch is
>for the address containing ipv4 header. Both can be in separate cache lines, as
>ipv4 starts at mbuf->buf_addr + mbuf->data_off + sizeof(ethernet header).
>data_off is generally 64 bytes or 128 bytes depending on cache line size.
>	>
>
>
>
>Indeed, both of them are in separate cache lines. But my point here is about
>data dependency for the second prefetch. In order to issue the prefetch
>instruction for the cache line containing the ipv4 header you must know the
>address. You calculate this address by:
>rte_pktmbuf_mtod_offset(pkts[i], void *, sizeof(struct rte_ether_hdr))
>rte_pktmbuf_mtod_offset is accessing mbuf->buf_addr and mbuf->data_off ,
>as you mentioned. But, these fields are not in your L1 cache at the moment,
>because you just asked to prefetch them with the previous instruction.
Ack.
>So my suggestion here would be to look at how it is done in
>ipv4_lookup_sse.c:ip4_lookup_node_process_vec()
>Simplifying, in a run loop it prefetches mbuf + 2, then prefetches CL with the v4
>header of the mbuf + 1 (assuming in a previous invocation mbuf CL was
>already fetched into L1), and finally does processing of the mbuf + 0 (again,
>assuming the previous iteration ipv4 CL was fetched).
I have seen that implementation; the code looks fine to me. I will try to add similar code in the fib lookup node.
>
>
>	>> +    }
>	>> +#endif
>	>> +
>	>> +    i = 0;
>	>> +    while (n_left_from >= 4) {
>	>> +#if RTE_GRAPH_BURST_SIZE > 64
>	>> +            if (likely(n_left_from > 7)) {
>	>> +                    rte_prefetch0(pkts[4]);
>	>> +                    rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[4], void
>	>*,
>	>> +                                    sizeof(struct rte_ether_hdr)));
>	>> +                    rte_prefetch0(pkts[5]);
>	>> +                    rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[5], void
>	>*,
>	>> +                                    sizeof(struct rte_ether_hdr)));
>	>> +                    rte_prefetch0(pkts[6]);
>	>> +                    rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[6], void
>	>*,
>	>> +                                    sizeof(struct rte_ether_hdr)));
>	>> +                    rte_prefetch0(pkts[7]);
>	>> +                    rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[7], void
>	>*,
>	>> +                                    sizeof(struct rte_ether_hdr)));
>	>> +            }
>	>> +#endif
>	>> +
>	>> +            mbuf0 = pkts[0];
>	>> +            mbuf1 = pkts[1];
>	>> +            mbuf2 = pkts[2];
>	>> +            mbuf3 = pkts[3];
>	>> +            pkts += 4;
>	>> +            n_left_from -= 4;
>	>> +            /* Extract DIP of mbuf0 */
>	>> +            ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf0, struct
>	>rte_ipv4_hdr *,
>	>> +                            sizeof(struct rte_ether_hdr));
>	>> +            /* Extract cksum, ttl as ipv4 hdr is in cache */
>	>> +            node_mbuf_priv1(mbuf0, dyn)->cksum = ipv4_hdr-
>	>>hdr_checksum;
>	>> +            node_mbuf_priv1(mbuf0, dyn)->ttl = ipv4_hdr-
>>time_to_live;
>	>> +
>	>> +            ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
>	>> +
>	>> +            /* Extract DIP of mbuf1 */
>	>> +            ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf1, struct
>	>rte_ipv4_hdr *,
>	>> +                            sizeof(struct rte_ether_hdr));
>	>> +            /* Extract cksum, ttl as ipv4 hdr is in cache */
>	>> +            node_mbuf_priv1(mbuf1, dyn)->cksum = ipv4_hdr-
>	>>hdr_checksum;
>	>> +            node_mbuf_priv1(mbuf1, dyn)->ttl = ipv4_hdr-
>>time_to_live;
>	>> +
>	>> +            ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
>	>> +
>	>> +            /* Extract DIP of mbuf2 */
>	>> +            ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf2, struct
>	>rte_ipv4_hdr *,
>	>> +                            sizeof(struct rte_ether_hdr));
>	>> +            /* Extract cksum, ttl as ipv4 hdr is in cache */
>	>> +            node_mbuf_priv1(mbuf2, dyn)->cksum = ipv4_hdr-
>	>>hdr_checksum;
>	>> +            node_mbuf_priv1(mbuf2, dyn)->ttl = ipv4_hdr-
>>time_to_live;
>	>> +
>	>> +            ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
>	>> +
>	>> +            /* Extract DIP of mbuf3 */
>	>> +            ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf3, struct
>	>rte_ipv4_hdr *,
>	>> +                            sizeof(struct rte_ether_hdr));
>	>> +
>	>> +            /* Extract cksum, ttl as ipv4 hdr is in cache */
>	>> +            node_mbuf_priv1(mbuf3, dyn)->cksum = ipv4_hdr-
>	>>hdr_checksum;
>	>> +            node_mbuf_priv1(mbuf3, dyn)->ttl = ipv4_hdr-
>>time_to_live;
>	>> +
>	>> +            ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
>	>> +    }
>	>> +    while (n_left_from > 0) {
>	>> +            mbuf0 = pkts[0];
>	>> +            pkts += 1;
>	>> +            n_left_from -= 1;
>	>> +
>	>> +            /* Extract DIP of mbuf0 */
>	>> +            ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf0, struct
>	>rte_ipv4_hdr *,
>	>> +                            sizeof(struct rte_ether_hdr));
>	>> +            /* Extract cksum, ttl as ipv4 hdr is in cache */
>	>> +            node_mbuf_priv1(mbuf0, dyn)->cksum = ipv4_hdr-
>	>>hdr_checksum;
>	>> +            node_mbuf_priv1(mbuf0, dyn)->ttl = ipv4_hdr-
>>time_to_live;
>	>> +
>	>> +            ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
>	>> +    }
>	>> +
>	>> +    rc = rte_fib_lookup_bulk(fib, ip, next_hop, nb_objs);
>	>> +    if (unlikely(rc != 0))
>	>> +            return 0;
>	>> +
>	>> +    for (i = 0; i < nb_objs; i++) {
>	>> +            if (unlikely(next_hop[i] == FIB_DEFAULT_NH)) {
>	>> +                    next_hop[i] = drop_nh;
>	>maybe it is worth just do define FIB_DEFAULT_NH as a drop_nh vaue
>to omit
>	>these next_hop reassignments?
>	Ack.
>	>> +                    lookup_err += 1;
>	>> +            }
>	>> +
>	>> +            mbuf0 = (struct rte_mbuf *)objs[i];
>	>> +            node_mbuf_priv1(mbuf0, dyn)->nh =
>(uint16_t)next_hop[i];
>	>> +            next = (uint16_t)(next_hop[i] >> 16);
>	>> +
>	>> +            if (unlikely(next_index ^ next)) {
>	>> +                    /* Copy things successfully speculated till now */
>	>> +                    rte_memcpy(to_next, from, last_spec *
>	>sizeof(from[0]));
>	>> +                    from += last_spec;
>	>> +                    to_next += last_spec;
>	>> +                    held += last_spec;
>	>> +                    last_spec = 0;
>	>> +
>	>> +                    rte_node_enqueue_x1(graph, node, next, from[0]);
>	>> +                    from += 1;
>	>> +            } else {
>	>> +                    last_spec += 1;
>	>> +            }
>	>> +    }
>	>> +
>	>> +    /* !!! Home run !!! */
>	>> +    if (likely(last_spec == nb_objs)) {
>	>> +            rte_node_next_stream_move(graph, node, next_index);
>	>> +            return nb_objs;
>	>> +    }
>	>> +
>	>> +    NODE_INCREMENT_XSTAT_ID(node, 0, lookup_err != 0,
>lookup_err);
>	>> +    held += last_spec;
>	>> +    rte_memcpy(to_next, from, last_spec * sizeof(from[0]));
>	>> +    rte_node_next_stream_put(graph, node, next_index, held);
>	>> +
>	>> +    return nb_objs;
>	>> +}
>	>> +
>	>>
>RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_node_ip4_fib_route_add, 25.07)
>	>>   int
>	>>   rte_node_ip4_fib_route_add(uint32_t ip, uint8_t depth, uint16_t
>	>> next_hop, @@ -147,6 +310,7 @@ static struct rte_node_xstats
>	>ip4_lookup_fib_xstats = {
>	>>   };
>	>>
>	>>   static struct rte_node_register ip4_lookup_fib_node = {
>	>> +    .process = ip4_lookup_fib_node_process,
>	>>      .name = "ip4_lookup_fib",
>	>>
>	>>      .init = ip4_lookup_fib_node_init,
>	>
>	>--
>	>Regards,
>	>Vladimir
>
>
>
>
>
>--
>
>Regards,
>
>Vladimir

Regards,
Ankur
  

Patch

diff --git a/lib/node/ip4_lookup_fib.c b/lib/node/ip4_lookup_fib.c
index e87864e672..c535b191f8 100644
--- a/lib/node/ip4_lookup_fib.c
+++ b/lib/node/ip4_lookup_fib.c
@@ -40,6 +40,169 @@  static struct ip4_lookup_fib_node_main ip4_lookup_fib_nm;
 #define IP4_LOOKUP_NODE_PRIV1_OFF(ctx) \
 	(((struct ip4_lookup_fib_node_ctx *)ctx)->mbuf_priv1_off)
 
+static uint16_t
+ip4_lookup_fib_node_process(struct rte_graph *graph, struct rte_node *node, void **objs,
+			    uint16_t nb_objs)
+{
+	struct rte_mbuf *mbuf0, *mbuf1, *mbuf2, *mbuf3, **pkts;
+	struct rte_fib *fib = IP4_LOOKUP_NODE_FIB(node->ctx);
+	const int dyn = IP4_LOOKUP_NODE_PRIV1_OFF(node->ctx);
+	struct rte_ipv4_hdr *ipv4_hdr;
+	uint64_t next_hop[nb_objs];
+	uint16_t lookup_err = 0;
+	void **to_next, **from;
+	uint16_t last_spec = 0;
+	rte_edge_t next_index;
+	uint16_t n_left_from;
+	uint32_t ip[nb_objs];
+	uint16_t held = 0;
+	uint32_t drop_nh;
+	uint16_t next;
+	int i, rc;
+
+	/* Speculative next */
+	next_index = RTE_NODE_IP4_LOOKUP_NEXT_REWRITE;
+	/* Drop node */
+	drop_nh = ((uint32_t)RTE_NODE_IP4_LOOKUP_NEXT_PKT_DROP) << 16;
+
+	pkts = (struct rte_mbuf **)objs;
+	from = objs;
+	n_left_from = nb_objs;
+
+	/* Get stream for the speculated next node */
+	to_next = rte_node_next_stream_get(graph, node, next_index, nb_objs);
+
+	for (i = OBJS_PER_CLINE; i < RTE_GRAPH_BURST_SIZE; i += OBJS_PER_CLINE)
+		rte_prefetch0(&objs[i]);
+
+#if RTE_GRAPH_BURST_SIZE > 64
+	for (i = 0; i < 4 && i < n_left_from; i++) {
+		rte_prefetch0(pkts[i]);
+		rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[i], void *,
+					sizeof(struct rte_ether_hdr)));
+	}
+#endif
+
+	i = 0;
+	while (n_left_from >= 4) {
+#if RTE_GRAPH_BURST_SIZE > 64
+		if (likely(n_left_from > 7)) {
+			rte_prefetch0(pkts[4]);
+			rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[4], void *,
+					sizeof(struct rte_ether_hdr)));
+			rte_prefetch0(pkts[5]);
+			rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[5], void *,
+					sizeof(struct rte_ether_hdr)));
+			rte_prefetch0(pkts[6]);
+			rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[6], void *,
+					sizeof(struct rte_ether_hdr)));
+			rte_prefetch0(pkts[7]);
+			rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[7], void *,
+					sizeof(struct rte_ether_hdr)));
+		}
+#endif
+
+		mbuf0 = pkts[0];
+		mbuf1 = pkts[1];
+		mbuf2 = pkts[2];
+		mbuf3 = pkts[3];
+		pkts += 4;
+		n_left_from -= 4;
+		/* Extract DIP of mbuf0 */
+		ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf0, struct rte_ipv4_hdr *,
+				sizeof(struct rte_ether_hdr));
+		/* Extract cksum, ttl as ipv4 hdr is in cache */
+		node_mbuf_priv1(mbuf0, dyn)->cksum = ipv4_hdr->hdr_checksum;
+		node_mbuf_priv1(mbuf0, dyn)->ttl = ipv4_hdr->time_to_live;
+
+		ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
+
+		/* Extract DIP of mbuf1 */
+		ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf1, struct rte_ipv4_hdr *,
+				sizeof(struct rte_ether_hdr));
+		/* Extract cksum, ttl as ipv4 hdr is in cache */
+		node_mbuf_priv1(mbuf1, dyn)->cksum = ipv4_hdr->hdr_checksum;
+		node_mbuf_priv1(mbuf1, dyn)->ttl = ipv4_hdr->time_to_live;
+
+		ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
+
+		/* Extract DIP of mbuf2 */
+		ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf2, struct rte_ipv4_hdr *,
+				sizeof(struct rte_ether_hdr));
+		/* Extract cksum, ttl as ipv4 hdr is in cache */
+		node_mbuf_priv1(mbuf2, dyn)->cksum = ipv4_hdr->hdr_checksum;
+		node_mbuf_priv1(mbuf2, dyn)->ttl = ipv4_hdr->time_to_live;
+
+		ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
+
+		/* Extract DIP of mbuf3 */
+		ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf3, struct rte_ipv4_hdr *,
+				sizeof(struct rte_ether_hdr));
+
+		/* Extract cksum, ttl as ipv4 hdr is in cache */
+		node_mbuf_priv1(mbuf3, dyn)->cksum = ipv4_hdr->hdr_checksum;
+		node_mbuf_priv1(mbuf3, dyn)->ttl = ipv4_hdr->time_to_live;
+
+		ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
+	}
+	while (n_left_from > 0) {
+		mbuf0 = pkts[0];
+		pkts += 1;
+		n_left_from -= 1;
+
+		/* Extract DIP of mbuf0 */
+		ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf0, struct rte_ipv4_hdr *,
+				sizeof(struct rte_ether_hdr));
+		/* Extract cksum, ttl as ipv4 hdr is in cache */
+		node_mbuf_priv1(mbuf0, dyn)->cksum = ipv4_hdr->hdr_checksum;
+		node_mbuf_priv1(mbuf0, dyn)->ttl = ipv4_hdr->time_to_live;
+
+		ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
+	}
+
+	rc = rte_fib_lookup_bulk(fib, ip, next_hop, nb_objs);
+	if (unlikely(rc != 0))
+		return 0;
+
+	for (i = 0; i < nb_objs; i++) {
+		if (unlikely(next_hop[i] == FIB_DEFAULT_NH)) {
+			next_hop[i] = drop_nh;
+			lookup_err += 1;
+		}
+
+		mbuf0 = (struct rte_mbuf *)objs[i];
+		node_mbuf_priv1(mbuf0, dyn)->nh = (uint16_t)next_hop[i];
+		next = (uint16_t)(next_hop[i] >> 16);
+
+		if (unlikely(next_index ^ next)) {
+			/* Copy things successfully speculated till now */
+			rte_memcpy(to_next, from, last_spec * sizeof(from[0]));
+			from += last_spec;
+			to_next += last_spec;
+			held += last_spec;
+			last_spec = 0;
+
+			rte_node_enqueue_x1(graph, node, next, from[0]);
+			from += 1;
+		} else {
+			last_spec += 1;
+		}
+	}
+
+	/* !!! Home run !!! */
+	if (likely(last_spec == nb_objs)) {
+		rte_node_next_stream_move(graph, node, next_index);
+		return nb_objs;
+	}
+
+	NODE_INCREMENT_XSTAT_ID(node, 0, lookup_err != 0, lookup_err);
+	held += last_spec;
+	rte_memcpy(to_next, from, last_spec * sizeof(from[0]));
+	rte_node_next_stream_put(graph, node, next_index, held);
+
+	return nb_objs;
+}
+
 RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_node_ip4_fib_route_add, 25.07)
 int
 rte_node_ip4_fib_route_add(uint32_t ip, uint8_t depth, uint16_t next_hop,
@@ -147,6 +310,7 @@  static struct rte_node_xstats ip4_lookup_fib_xstats = {
 };
 
 static struct rte_node_register ip4_lookup_fib_node = {
+	.process = ip4_lookup_fib_node_process,
 	.name = "ip4_lookup_fib",
 
 	.init = ip4_lookup_fib_node_init,