[RFC,5/6] examples/l3fwd: make ACL work in pipeline and eventdev modes

Message ID 20240815085339.1434-6-konstantin.v.ananyev@yandex.ru (mailing list archive)
State Superseded
Delegated to: Thomas Monjalon
Headers
Series Stage-Ordered API and other extensions for ring library |

Checks

Context Check Description
ci/checkpatch warning coding style issues

Commit Message

Konstantin Ananyev Aug. 15, 2024, 8:53 a.m. UTC
From: Konstantin Ananyev <konstantin.ananyev@huawei.com>

Note upfront:
This is a huge commit that is combined from several ones.
For now, I submit it just for reference and demonstration purposes and
will probably remove it in future versions.
If we decide to go ahead with it, then it needs to be reworked and split
into several proper commits.

It adds for l3fwd:
 - eventdev mode for ACL lookup-mode
 - Introduce a worker-pool-mode
   (right now implemented for ACL lookup-mode only).
Worker-Pool mode is a simple pipeline model, with the following stages:
 1) I/O thread receives packets from NIC RX HW queues and enqueues them
    into the work queue
 2) Worker thread reads packets from the work queue(s),
    processes them and then puts the processed packets back into the
    work queue along with the processing status (routing info/error code).
 3) I/O thread dequeues packets and their status from the work queue,
    and based on it either transmits the packet or drops it.
Very similar to l3fwd-eventdev working model.

Note that there can be several I/O threads, each serving one or multiple
HW RX queues. There can also be several Worker threads, each of which can
process packets from multiple work queues in round-robin fashion.

Work queue can be one of the following types:
 - wqorder: allows Worker threads to process packets in any order,
   but guarantees that on dequeue stage the ingress order of packets
   will be preserved. I.E. at stage #3, I/O thread will get packets
   exactly in the same order as they were enqueued at stage #1.
 - wqunorder: doesn't provide any ordering guarantees.

'wqunorder' mode is implemented using 2 rte_ring structures per queue.
'wqorder' mode is implemented using one rte_soring structure per queue.

To facilitate this new functionality, command line parameters were
extended:
 --mode:
   Possible values (one of): poll/eventdev/wqorder/wqorderS/wqunorder/wqunorderS
   Default value: poll
   - wqorder: Worker-Pool ordered mode with a separate work queue for each
     HW RX queue.
   - wqorderS: Worker-Pool ordered mode with one work queue per I/O thread.
   - wqunorder: Worker-Pool un-ordered mode with a separate work queue for each
     HW RX queue.
   - wqunorderS: Worker-Pool un-ordered mode with one work queue per I/O thread.
 --wqsize: number of elements for each worker queue.
 --lookup-iter: forces the ACL lookup to be performed several times over the
   same packet. This is an artificial parameter, added temporarily for
   benchmarking purposes. It will be removed in later versions (if any).

Note that in Worker-Pool mode all free lcores that were not assigned as
I/O threads will be used as Worker threads.
As an example:
dpdk-l3fwd --lcores=53,55,57,59,61 ... -- \
-P -p f --config '(0,0,53)(1,0,53)(2,0,53)(3,0,53)' --lookup acl \
--parse-ptype --mode=wqorder ...
In that case lcore 53 will be used as I/O thread (stages #1,3)
to serve 4 HW RX queues,
while lcores 55,57,59,61 will serve as Worker threads (stage #2).

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
 examples/l3fwd/l3fwd.h           |  55 +++++++
 examples/l3fwd/l3fwd_acl.c       | 125 +++++++++++---
 examples/l3fwd/l3fwd_acl_event.h | 258 +++++++++++++++++++++++++++++
 examples/l3fwd/l3fwd_event.c     |  14 ++
 examples/l3fwd/l3fwd_event.h     |   1 +
 examples/l3fwd/l3fwd_sse.h       |  49 +++++-
 examples/l3fwd/l3fwd_wqp.c       | 274 +++++++++++++++++++++++++++++++
 examples/l3fwd/l3fwd_wqp.h       | 132 +++++++++++++++
 examples/l3fwd/main.c            |  75 ++++++++-
 examples/l3fwd/meson.build       |   1 +
 10 files changed, 956 insertions(+), 28 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_acl_event.h
 create mode 100644 examples/l3fwd/l3fwd_wqp.c
 create mode 100644 examples/l3fwd/l3fwd_wqp.h
  

Patch

diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h
index 93ce652d02..218f363764 100644
--- a/examples/l3fwd/l3fwd.h
+++ b/examples/l3fwd/l3fwd.h
@@ -77,6 +77,42 @@  struct __rte_cache_aligned lcore_rx_queue {
 	uint16_t queue_id;
 };
 
+/* Packet-processing model used by the l3fwd lcores (selected by --mode). */
+enum L3FWD_WORKER_MODE {
+	L3FWD_WORKER_POLL, /* no work queues: lookup is done by the I/O lcore */
+	L3FWD_WORKER_UNQUE, /* worker pool over un-ordered queues (rte_ring) */
+	L3FWD_WORKER_ORQUE, /* worker pool over ordered queues (rte_soring) */
+};
+
+/* Worker-pool configuration; a copy is stored in every lcore's queue pool. */
+struct l3fwd_wqp_param {
+	enum L3FWD_WORKER_MODE mode;
+	uint32_t qsize;    /**< Number of elems in worker queue */
+	int32_t single;    /**< use single queue per I/O (poll) thread */
+};
+
+extern struct l3fwd_wqp_param l3fwd_wqp_param;
+
+/* Indexes into lcore_wq.r[] for the un-ordered (two ring) work queue. */
+enum {
+	LCORE_WQ_IN, /* filled by the I/O thread, drained by workers */
+	LCORE_WQ_OUT, /* filled by workers, drained by the I/O thread */
+	LCORE_WQ_NUM, /* number of rings per work queue */
+};
+
+/*
+ * One work queue. The valid member depends on the worker mode:
+ * r[] is used for L3FWD_WORKER_UNQUE, sor/ftoken for L3FWD_WORKER_ORQUE.
+ */
+union lcore_wq {
+	struct rte_ring *r[LCORE_WQ_NUM];
+	struct {
+		struct rte_soring *sor;
+		/*
+		 * used by WQ, sort of thread-local var:
+		 * filled by rte_soring_acquire() and passed back to
+		 * rte_soring_release() by the worker thread.
+		 */
+		uint32_t ftoken;
+	};
+};
+
+/* Set of work queues served by one lcore (I/O or worker). */
+struct lcore_wq_pool {
+	uint32_t nb_queue; /* number of valid entries in queue[] */
+	uint32_t qmask; /* AND-ed with a queue index; 0 maps all to queue 0 */
+	union lcore_wq queue[MAX_RX_QUEUE_PER_LCORE];
+	struct l3fwd_wqp_param prm; /* copy of the parameters in use */
+};
+
 struct __rte_cache_aligned lcore_conf {
 	uint16_t n_rx_queue;
 	struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
@@ -86,6 +122,7 @@  struct __rte_cache_aligned lcore_conf {
 	struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
 	void *ipv4_lookup_struct;
 	void *ipv6_lookup_struct;
+	struct lcore_wq_pool wqpool;
 };
 
 extern volatile bool force_quit;
@@ -115,6 +152,8 @@  extern struct acl_algorithms acl_alg[];
 
 extern uint32_t max_pkt_len;
 
+extern uint32_t l3fwd_lookup_iter_num;
+
 /* Send burst of packets on an output interface */
 static inline int
 send_burst(struct lcore_conf *qconf, uint16_t n, uint16_t port)
@@ -308,6 +347,22 @@  fib_event_main_loop_tx_q_vector(__rte_unused void *dummy);
 int
 fib_event_main_loop_tx_q_burst_vector(__rte_unused void *dummy);
 
+int
+acl_event_main_loop_tx_d(__rte_unused void *dummy);
+int
+acl_event_main_loop_tx_d_burst(__rte_unused void *dummy);
+int
+acl_event_main_loop_tx_q(__rte_unused void *dummy);
+int
+acl_event_main_loop_tx_q_burst(__rte_unused void *dummy);
+int
+acl_event_main_loop_tx_d_vector(__rte_unused void *dummy);
+int
+acl_event_main_loop_tx_d_burst_vector(__rte_unused void *dummy);
+int
+acl_event_main_loop_tx_q_vector(__rte_unused void *dummy);
+int
+acl_event_main_loop_tx_q_burst_vector(__rte_unused void *dummy);
 
 /* Return ipv4/ipv6 fwd lookup struct for ACL, LPM, EM or FIB. */
 void *
diff --git a/examples/l3fwd/l3fwd_acl.c b/examples/l3fwd/l3fwd_acl.c
index b635011ef7..8b550e495a 100644
--- a/examples/l3fwd/l3fwd_acl.c
+++ b/examples/l3fwd/l3fwd_acl.c
@@ -4,6 +4,7 @@ 
 
 #include "l3fwd.h"
 #include "l3fwd_route.h"
+#include "l3fwd_wqp.h"
 
 /*
  * Rule and trace formats definitions.
@@ -1003,19 +1004,21 @@  acl_process_pkts(struct rte_mbuf *pkts[MAX_PKT_BURST],
 	/* split packets burst depending on packet type (IPv4/IPv6) */
 	l3fwd_acl_prepare_acl_parameter(pkts, &acl_search, num);
 
-	if (acl_search.num_ipv4)
-		rte_acl_classify(acl_config.acx_ipv4[socketid],
+	for (i = l3fwd_lookup_iter_num; i != 0; i--) {
+		if (acl_search.num_ipv4)
+			rte_acl_classify(acl_config.acx_ipv4[socketid],
 				acl_search.data_ipv4,
 				acl_search.res_ipv4,
 				acl_search.num_ipv4,
 				DEFAULT_MAX_CATEGORIES);
 
-	if (acl_search.num_ipv6)
-		rte_acl_classify(acl_config.acx_ipv6[socketid],
+		if (acl_search.num_ipv6)
+			rte_acl_classify(acl_config.acx_ipv6[socketid],
 				acl_search.data_ipv6,
 				acl_search.res_ipv6,
 				acl_search.num_ipv6,
 				DEFAULT_MAX_CATEGORIES);
+	}
 
 	/* combine lookup results back, into one array of next hops */
 	n4 = 0;
@@ -1042,34 +1045,36 @@  acl_process_pkts(struct rte_mbuf *pkts[MAX_PKT_BURST],
 
 static inline void
 acl_send_packets(struct lcore_conf *qconf, struct rte_mbuf *pkts[],
-	uint16_t hops[], uint32_t num)
+	uint16_t hops[], uint32_t num, int step3)
 {
 #if defined ACL_SEND_MULTI
-	send_packets_multi(qconf, pkts, hops, num);
+	__send_packets_multi(qconf, pkts, hops, num, step3);
 #else
-	send_packets_single(qconf, pkts, hops, num);
+	if (step3 != 0)
+		send_packets_single(qconf, pkts, hops, num);
+	else {
+		uint32_t i;
+		for (i = 0; i != num; i++)
+			send_single_packet(qconf, pkts[i], hops[i]);
+	}
 #endif
 }
 
 /* main processing loop */
-int
-acl_main_loop(__rte_unused void *dummy)
+static int
+acl_poll_loop(struct lcore_conf *qconf, uint32_t lcore_id)
 {
 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
 	uint16_t hops[MAX_PKT_BURST];
-	unsigned int lcore_id;
 	uint64_t prev_tsc, diff_tsc, cur_tsc;
-	int i, nb_rx;
+	uint32_t i, n, nb_rx;
 	uint16_t portid;
 	uint16_t queueid;
-	struct lcore_conf *qconf;
 	int socketid;
 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1)
 			/ US_PER_S * BURST_TX_DRAIN_US;
 
 	prev_tsc = 0;
-	lcore_id = rte_lcore_id();
-	qconf = &lcore_conf[lcore_id];
 	socketid = rte_lcore_to_socket_id(lcore_id);
 
 	if (qconf->n_rx_queue == 0) {
@@ -1121,17 +1126,99 @@  acl_main_loop(__rte_unused void *dummy)
 			nb_rx = rte_eth_rx_burst(portid, queueid,
 				pkts_burst, MAX_PKT_BURST);
 
-			if (nb_rx > 0) {
-				acl_process_pkts(pkts_burst, hops, nb_rx,
-					socketid);
-				acl_send_packets(qconf, pkts_burst, hops,
-					nb_rx);
+			if (nb_rx != 0) {
+				if (l3fwd_wqp_param.mode == L3FWD_WORKER_POLL) {
+					acl_process_pkts(pkts_burst, hops,
+						nb_rx, socketid);
+					acl_send_packets(qconf, pkts_burst,
+						hops, nb_rx, 1);
+				} else {
+					n = lcore_wq_submit(&qconf->wqpool, i,
+						pkts_burst, nb_rx);
+					if (n != nb_rx) {
+						/* update stats counter */
+						rte_pktmbuf_free_bulk(
+							pkts_burst + n,
+							nb_rx - n);
+					}
+				}
+			}
+			if (l3fwd_wqp_param.mode != L3FWD_WORKER_POLL) {
+				nb_rx = lcore_wq_receive(&qconf->wqpool, i,
+					pkts_burst, hops, MAX_PKT_BURST);
+				if (nb_rx != 0)
+					acl_send_packets(qconf, pkts_burst,
+						hops, nb_rx, 0);
+			}
+		}
+	}
+	return 0;
+}
+
+/*
+ * WT processing loop
+ *
+ * Worker-thread loop: until force_quit is set, visit every work queue
+ * assigned to this lcore in turn, pull a burst of packets, run the ACL
+ * lookup on it and push the packets back together with their next hops.
+ * Packets that could not be pushed back are freed.
+ * Returns 0, both on normal exit and when the lcore has no work queues.
+ */
+static int
+acl_wqp_loop(struct lcore_conf *qconf, uint32_t lcore_id)
+{
+	int32_t socketid;
+	uint32_t i, k, n;
+	struct rte_mbuf *pkts[MAX_PKT_BURST];
+	uint16_t hops[MAX_PKT_BURST];
+
+	socketid = rte_lcore_to_socket_id(lcore_id);
+
+	if (qconf->wqpool.nb_queue == 0) {
+		RTE_LOG(INFO, L3FWD, "%s: lcore %u has nothing to do\n",
+			__func__, lcore_id);
+		return 0;
+	}
+
+	RTE_LOG(INFO, L3FWD, "%s: entering loop on lcore %u\n",
+		__func__, lcore_id);
+
+	while (!force_quit) {
+
+		/*
+		 * Read packet from internal queues and process them
+		 */
+		for (i = 0; i < qconf->wqpool.nb_queue; ++i) {
+
+			n = lcore_wq_pull(&qconf->wqpool, i, pkts,
+				RTE_DIM(pkts));
+			if (n == 0)
+				continue;
+
+			acl_process_pkts(pkts, hops, n, socketid);
+			/*
+			 * step 3 is done here; the I/O thread sends the
+			 * returned packets with step3 == 0.
+			 */
+			process_step3_burst(pkts, hops, n);
+			k = lcore_wq_push(&qconf->wqpool, i, pkts, hops, n);
+			if (n != k) {
+				/* stats update */
+				rte_pktmbuf_free_bulk(pkts + k, n - k);
 			}
 		}
 	}
 	return 0;
 }
 
+/*
+ * main processing loop
+ *
+ * Per-lcore entry point for the ACL lookup mode. An lcore that owns at
+ * least one HW RX queue runs the I/O (poll) loop; an lcore without RX
+ * queues runs the worker loop, which returns at once when no work queues
+ * were assigned to it.
+ */
+int
+acl_main_loop(__rte_unused void *dummy)
+{
+	uint32_t lcore_id;
+	struct lcore_conf *qconf;
+
+	lcore_id = rte_lcore_id();
+	qconf = &lcore_conf[lcore_id];
+
+	if (qconf->n_rx_queue != 0)
+		return acl_poll_loop(qconf, lcore_id);
+	else
+		return acl_wqp_loop(qconf, lcore_id);
+}
+
+#ifdef RTE_LIB_EVENTDEV
+#include "l3fwd_acl_event.h"
+#endif
+
 /* Not used by L3fwd ACL. */
 void *
 acl_get_ipv4_l3fwd_lookup_struct(__rte_unused const int socketid)
diff --git a/examples/l3fwd/l3fwd_acl_event.h b/examples/l3fwd/l3fwd_acl_event.h
new file mode 100644
index 0000000000..240dd3fb03
--- /dev/null
+++ b/examples/l3fwd/l3fwd_acl_event.h
@@ -0,0 +1,258 @@ 
+#include "l3fwd_event.h"
+
+/*
+ * One eventdev loop for single and burst using acl.
+ *
+ * Dequeues up to MAX_PKT_BURST events at a time, runs the ACL lookup on
+ * their mbufs, stores the resulting port in each mbuf and hands the events
+ * on: re-enqueued to the last event queue when L3FWD_EVENT_TX_ENQ is set in
+ * 'flags', or given to the TX adapter when L3FWD_EVENT_TX_DIRECT is set.
+ * Returns without doing anything if no free event port is available.
+ */
+static __rte_always_inline void
+acl_event_loop(struct l3fwd_event_resources *evt_rsrc,
+		const uint8_t flags)
+{
+	uint32_t i, lcore_id, nb_deq, nb_enq;
+	int32_t socketid;
+	uint16_t hops[MAX_PKT_BURST];
+	struct rte_mbuf *pkts[MAX_PKT_BURST];
+	struct rte_event events[MAX_PKT_BURST];
+
+	const int event_p_id = l3fwd_get_free_event_port(evt_rsrc);
+	const uint8_t tx_q_id = evt_rsrc->evq.event_q_id[
+			evt_rsrc->evq.nb_queues - 1];
+	const uint8_t event_d_id = evt_rsrc->event_d_id;
+	/* never ask for more events than the local arrays can hold */
+	const uint16_t deq_len = RTE_MIN(evt_rsrc->deq_depth, MAX_PKT_BURST);
+
+	if (event_p_id < 0)
+		return;
+
+	lcore_id = rte_lcore_id();
+	socketid = rte_lcore_to_socket_id(lcore_id);
+
+	RTE_LOG(INFO, L3FWD, "entering %s on lcore %u\n", __func__, lcore_id);
+
+	nb_deq = 0;
+	nb_enq = 0;
+
+	while (!force_quit) {
+		/* Read events from RX queues. */
+		nb_deq = rte_event_dequeue_burst(event_d_id, event_p_id,
+				events, deq_len, 0);
+		if (nb_deq == 0) {
+			rte_pause();
+			continue;
+		}
+
+		/* collect the mbufs and prepare the events for forwarding */
+		for (i = 0; i != nb_deq; i++) {
+			pkts[i] = events[i].mbuf;
+			if (flags & L3FWD_EVENT_TX_ENQ) {
+				events[i].queue_id = tx_q_id;
+				events[i].op = RTE_EVENT_OP_FORWARD;
+			}
+			rte_event_eth_tx_adapter_txq_set(pkts[i], 0);
+		}
+
+		acl_process_pkts(pkts, hops, nb_deq, socketid);
+
+		/* keep the original port when the lookup gave BAD_PORT */
+		for (i = 0; i != nb_deq; i++) {
+			process_packet(pkts[i], &hops[i]);
+			pkts[i]->port = (hops[i] != BAD_PORT) ?
+				       hops[i] : pkts[i]->port;
+		}
+
+		/* retry until everything is enqueued or we are told to quit */
+		if (flags & L3FWD_EVENT_TX_ENQ) {
+			nb_enq = rte_event_enqueue_burst(event_d_id, event_p_id,
+					events, nb_deq);
+			while (nb_enq < nb_deq && !force_quit)
+				nb_enq += rte_event_enqueue_burst(event_d_id,
+						event_p_id, events + nb_enq,
+						nb_deq - nb_enq);
+		}
+
+		if (flags & L3FWD_EVENT_TX_DIRECT) {
+			nb_enq = rte_event_eth_tx_adapter_enqueue(event_d_id,
+					event_p_id, events, nb_deq, 0);
+			while (nb_enq < nb_deq && !force_quit)
+				nb_enq += rte_event_eth_tx_adapter_enqueue(
+						event_d_id, event_p_id,
+						events + nb_enq,
+						nb_deq - nb_enq, 0);
+		}
+	}
+
+	/* nb_enq/nb_deq describe the last burst, possibly only partly sent */
+	l3fwd_event_worker_cleanup(event_d_id, event_p_id, events, nb_enq,
+				   nb_deq, 0);
+}
+
+/*
+ * Non-vector eventdev entry points for ACL mode.
+ * The single and burst variants run the same loop with the same flag.
+ */
+
+/* TX via the TX adapter (L3FWD_EVENT_TX_DIRECT). */
+int __rte_noinline
+acl_event_main_loop_tx_d(__rte_unused void *dummy)
+{
+	struct l3fwd_event_resources *evt_rsrc =
+			l3fwd_get_eventdev_rsrc();
+
+	acl_event_loop(evt_rsrc, L3FWD_EVENT_TX_DIRECT);
+	return 0;
+}
+
+/* TX via the TX adapter (L3FWD_EVENT_TX_DIRECT), burst variant. */
+int __rte_noinline
+acl_event_main_loop_tx_d_burst(__rte_unused void *dummy)
+{
+	struct l3fwd_event_resources *evt_rsrc =
+			l3fwd_get_eventdev_rsrc();
+
+	acl_event_loop(evt_rsrc, L3FWD_EVENT_TX_DIRECT);
+	return 0;
+}
+
+/* TX by forwarding events to the TX event queue (L3FWD_EVENT_TX_ENQ). */
+int __rte_noinline
+acl_event_main_loop_tx_q(__rte_unused void *dummy)
+{
+	struct l3fwd_event_resources *evt_rsrc =
+			l3fwd_get_eventdev_rsrc();
+
+	acl_event_loop(evt_rsrc, L3FWD_EVENT_TX_ENQ);
+	return 0;
+}
+
+/* TX by forwarding events to the TX event queue, burst variant. */
+int __rte_noinline
+acl_event_main_loop_tx_q_burst(__rte_unused void *dummy)
+{
+	struct l3fwd_event_resources *evt_rsrc =
+			l3fwd_get_eventdev_rsrc();
+
+	acl_event_loop(evt_rsrc, L3FWD_EVENT_TX_ENQ);
+	return 0;
+}
+
+/*
+ * Run the ACL lookup and per-packet processing on one event vector.
+ *
+ * The lookup is done in chunks of at most MAX_PKT_BURST mbufs, so 'hops'
+ * must have room for vec->nb_elem entries. The vector and the resulting
+ * hops are then passed to process_event_vector().
+ */
+static __rte_always_inline void
+acl_process_event_vector(struct rte_event_vector *vec, uint16_t *hops,
+	int32_t socketid)
+{
+	uint32_t i, k;
+
+	/* k is the size of the current chunk */
+	for (i = 0; i != vec->nb_elem; i += k) {
+		k = RTE_MIN(vec->nb_elem - i, (uint32_t)MAX_PKT_BURST);
+		acl_process_pkts(vec->mbufs + i, hops + i, k, socketid);
+	}
+
+#if defined ACL_SEND_MULTI
+	/* groups of FWDSTEP packets first, then the remainder one by one */
+	k = RTE_ALIGN_FLOOR(vec->nb_elem, FWDSTEP);
+
+	for (i = 0; i != k; i += FWDSTEP)
+		processx4_step3(&vec->mbufs[i], &hops[i]);
+	for (; i < vec->nb_elem; i++)
+		process_packet(vec->mbufs[i], &hops[i]);
+#else
+	for (i = 0; i < vec->nb_elem; i++)
+		process_packet(vec->mbufs[i], &hops[i]);
+#endif
+
+	process_event_vector(vec, hops);
+}
+
+/*
+ * One eventdev loop for event vectors using acl.
+ *
+ * Dequeues up to MAX_PKT_BURST vector events at a time, runs the ACL
+ * lookup on every vector and hands the events on: re-enqueued to the last
+ * event queue when L3FWD_EVENT_TX_ENQ is set in 'flags', or given to the
+ * TX adapter when L3FWD_EVENT_TX_DIRECT is set.
+ * A per-lcore 'hops' buffer of evt_rsrc->vector_size entries is allocated
+ * for the lifetime of the loop. Returns without doing anything if no free
+ * event port is available or the buffer cannot be allocated.
+ */
+static __rte_always_inline void
+acl_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
+		      const uint8_t flags)
+{
+	uint16_t *hops;
+	int32_t socketid;
+	uint32_t i, lcore_id, nb_deq, nb_enq;
+	struct rte_event events[MAX_PKT_BURST];
+
+	const int event_p_id = l3fwd_get_free_event_port(evt_rsrc);
+	const uint8_t tx_q_id =
+		evt_rsrc->evq.event_q_id[evt_rsrc->evq.nb_queues - 1];
+	const uint8_t event_d_id = evt_rsrc->event_d_id;
+	/* events[] holds MAX_PKT_BURST entries: never dequeue more than that */
+	const uint16_t deq_len = RTE_MIN(evt_rsrc->deq_depth, MAX_PKT_BURST);
+
+	if (event_p_id < 0)
+		return;
+
+	lcore_id = rte_lcore_id();
+	socketid = rte_lcore_to_socket_id(lcore_id);
+
+	hops = rte_zmalloc_socket(NULL, sizeof(hops[0]) * evt_rsrc->vector_size,
+			RTE_CACHE_LINE_SIZE, socketid);
+	if (hops == NULL) {
+		RTE_LOG(ERR, L3FWD,
+			"%s: failed to alloc internal buffers on lcore %u\n",
+			__func__, lcore_id);
+		return;
+	}
+
+	RTE_LOG(INFO, L3FWD, "entering %s on lcore %u\n", __func__, lcore_id);
+
+	nb_deq = 0;
+	nb_enq = 0;
+
+	while (!force_quit) {
+		/* Read events from RX queues. */
+		nb_deq = rte_event_dequeue_burst(event_d_id, event_p_id, events,
+						 deq_len, 0);
+		if (nb_deq == 0) {
+			rte_pause();
+			continue;
+		}
+
+		for (i = 0; i < nb_deq; i++) {
+			if (flags & L3FWD_EVENT_TX_ENQ) {
+				events[i].queue_id = tx_q_id;
+				events[i].op = RTE_EVENT_OP_FORWARD;
+			}
+
+			acl_process_event_vector(events[i].vec, hops, socketid);
+		}
+
+		/* retry until everything is enqueued or we are told to quit */
+		if (flags & L3FWD_EVENT_TX_ENQ) {
+			nb_enq = rte_event_enqueue_burst(event_d_id, event_p_id,
+							 events, nb_deq);
+			while (nb_enq < nb_deq && !force_quit)
+				nb_enq += rte_event_enqueue_burst(
+					event_d_id, event_p_id, events + nb_enq,
+					nb_deq - nb_enq);
+		}
+
+		if (flags & L3FWD_EVENT_TX_DIRECT) {
+			nb_enq = rte_event_eth_tx_adapter_enqueue(
+				event_d_id, event_p_id, events, nb_deq, 0);
+			while (nb_enq < nb_deq && !force_quit)
+				nb_enq += rte_event_eth_tx_adapter_enqueue(
+					event_d_id, event_p_id, events + nb_enq,
+					nb_deq - nb_enq, 0);
+		}
+	}
+
+	/* nb_enq/nb_deq describe the last burst, possibly only partly sent */
+	l3fwd_event_worker_cleanup(event_d_id, event_p_id, events, nb_enq,
+				   nb_deq, 1);
+	rte_free(hops);
+}
+
+/*
+ * Event-vector eventdev entry points for ACL mode.
+ * The single and burst variants run the same loop with the same flag.
+ */
+
+/* TX via the TX adapter (L3FWD_EVENT_TX_DIRECT). */
+int __rte_noinline
+acl_event_main_loop_tx_d_vector(__rte_unused void *dummy)
+{
+	struct l3fwd_event_resources *evt_rsrc = l3fwd_get_eventdev_rsrc();
+
+	acl_event_loop_vector(evt_rsrc, L3FWD_EVENT_TX_DIRECT);
+	return 0;
+}
+
+/* TX via the TX adapter (L3FWD_EVENT_TX_DIRECT), burst variant. */
+int __rte_noinline
+acl_event_main_loop_tx_d_burst_vector(__rte_unused void *dummy)
+{
+	struct l3fwd_event_resources *evt_rsrc = l3fwd_get_eventdev_rsrc();
+
+	acl_event_loop_vector(evt_rsrc, L3FWD_EVENT_TX_DIRECT);
+	return 0;
+}
+
+/* TX by forwarding events to the TX event queue (L3FWD_EVENT_TX_ENQ). */
+int __rte_noinline
+acl_event_main_loop_tx_q_vector(__rte_unused void *dummy)
+{
+	struct l3fwd_event_resources *evt_rsrc = l3fwd_get_eventdev_rsrc();
+
+	acl_event_loop_vector(evt_rsrc, L3FWD_EVENT_TX_ENQ);
+	return 0;
+}
+
+/* TX by forwarding events to the TX event queue, burst variant. */
+int __rte_noinline
+acl_event_main_loop_tx_q_burst_vector(__rte_unused void *dummy)
+{
+	struct l3fwd_event_resources *evt_rsrc = l3fwd_get_eventdev_rsrc();
+
+	acl_event_loop_vector(evt_rsrc, L3FWD_EVENT_TX_ENQ);
+	return 0;
+}
diff --git a/examples/l3fwd/l3fwd_event.c b/examples/l3fwd/l3fwd_event.c
index c698ed27d2..cd8466a605 100644
--- a/examples/l3fwd/l3fwd_event.c
+++ b/examples/l3fwd/l3fwd_event.c
@@ -252,6 +252,16 @@  l3fwd_event_resource_setup(struct rte_eth_conf *port_conf)
 		[1][1][0] = fib_event_main_loop_tx_q_vector,
 		[1][1][1] = fib_event_main_loop_tx_q_burst_vector,
 	};
+	const event_loop_cb acl_event_loop[2][2][2] = {
+		[0][0][0] = acl_event_main_loop_tx_d,
+		[0][0][1] = acl_event_main_loop_tx_d_burst,
+		[0][1][0] = acl_event_main_loop_tx_q,
+		[0][1][1] = acl_event_main_loop_tx_q_burst,
+		[1][0][0] = acl_event_main_loop_tx_d_vector,
+		[1][0][1] = acl_event_main_loop_tx_d_burst_vector,
+		[1][1][0] = acl_event_main_loop_tx_q_vector,
+		[1][1][1] = acl_event_main_loop_tx_q_burst_vector,
+	};
 	uint32_t event_queue_cfg;
 	int ret;
 
@@ -295,6 +305,10 @@  l3fwd_event_resource_setup(struct rte_eth_conf *port_conf)
 	evt_rsrc->ops.fib_event_loop =
 		fib_event_loop[evt_rsrc->vector_enabled][evt_rsrc->tx_mode_q]
 			      [evt_rsrc->has_burst];
+
+	evt_rsrc->ops.acl_event_loop =
+		acl_event_loop[evt_rsrc->vector_enabled][evt_rsrc->tx_mode_q]
+			      [evt_rsrc->has_burst];
 }
 
 static void
diff --git a/examples/l3fwd/l3fwd_event.h b/examples/l3fwd/l3fwd_event.h
index c6a4a89127..21aa24c396 100644
--- a/examples/l3fwd/l3fwd_event.h
+++ b/examples/l3fwd/l3fwd_event.h
@@ -58,6 +58,7 @@  struct l3fwd_event_setup_ops {
 	event_loop_cb lpm_event_loop;
 	event_loop_cb em_event_loop;
 	event_loop_cb fib_event_loop;
+	event_loop_cb acl_event_loop;
 };
 
 struct l3fwd_event_resources {
diff --git a/examples/l3fwd/l3fwd_sse.h b/examples/l3fwd/l3fwd_sse.h
index 083729cdef..29c5c7c014 100644
--- a/examples/l3fwd/l3fwd_sse.h
+++ b/examples/l3fwd/l3fwd_sse.h
@@ -86,12 +86,35 @@  process_packet(struct rte_mbuf *pkt, uint16_t *dst_port)
 	_mm_storeu_si128((__m128i *)eth_hdr, te);
 }
 
+/*
+ * Apply the "step 3" per-packet processing to a whole burst without
+ * sending it: processx4_step3() for every full group of FWDSTEP packets,
+ * process_packet() for the remaining (num % FWDSTEP) ones.
+ * NOTE(review): presumably this is the L2 header update normally done
+ * inside send_packets_multi() - confirm against processx4_step3().
+ */
+static inline void
+process_step3_burst(struct rte_mbuf *pkt[], uint16_t dst_port[], uint32_t num)
+{
+	uint32_t i, k;
+
+	k = RTE_ALIGN_FLOOR(num, FWDSTEP);
+
+	for (i = 0; i != k; i += FWDSTEP)
+		processx4_step3(pkt + i, dst_port + i);
+
+	/* Process up to last 3 packets one by one. */
+	/* here i == k; the tail is handled from the last packet backwards */
+	switch (num % FWDSTEP) {
+	case 3:
+		process_packet(pkt[i + 2], dst_port + i + 2);
+		/* fall-through */
+	case 2:
+		process_packet(pkt[i + 1], dst_port + i + 1);
+		/* fall-through */
+	case 1:
+		process_packet(pkt[i], dst_port + i);
+	}
+}
+
 /**
  * Send packets burst from pkts_burst to the ports in dst_port array
  */
 static __rte_always_inline void
-send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
-		uint16_t dst_port[MAX_PKT_BURST], int nb_rx)
+__send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
+		uint16_t dst_port[MAX_PKT_BURST], int nb_rx, int step3)
 {
 	int32_t k;
 	int j = 0;
@@ -110,13 +133,15 @@  send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
 		lp = pnum;
 		lp[0] = 1;
 
-		processx4_step3(pkts_burst, dst_port);
+		if (step3 != 0)
+			processx4_step3(pkts_burst, dst_port);
 
 		/* dp1: <d[0], d[1], d[2], d[3], ... > */
 		dp1 = _mm_loadu_si128((__m128i *)dst_port);
 
 		for (j = FWDSTEP; j != k; j += FWDSTEP) {
-			processx4_step3(&pkts_burst[j], &dst_port[j]);
+			if (step3 != 0)
+				processx4_step3(&pkts_burst[j], &dst_port[j]);
 
 			/*
 			 * dp2:
@@ -155,17 +180,20 @@  send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
 	/* Process up to last 3 packets one by one. */
 	switch (nb_rx % FWDSTEP) {
 	case 3:
-		process_packet(pkts_burst[j], dst_port + j);
+		if (step3 != 0)
+			process_packet(pkts_burst[j], dst_port + j);
 		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
 		j++;
 		/* fall-through */
 	case 2:
-		process_packet(pkts_burst[j], dst_port + j);
+		if (step3 != 0)
+			process_packet(pkts_burst[j], dst_port + j);
 		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
 		j++;
 		/* fall-through */
 	case 1:
-		process_packet(pkts_burst[j], dst_port + j);
+		if (step3 != 0)
+			process_packet(pkts_burst[j], dst_port + j);
 		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
 		j++;
 	}
@@ -194,6 +222,13 @@  send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
 	}
 }
 
+/*
+ * Original entry point, kept for existing callers: sends the burst with
+ * the step 3 processing enabled (step3 == 1).
+ */
+static __rte_always_inline void
+send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
+		uint16_t dst_port[MAX_PKT_BURST], int nb_rx)
+{
+	__send_packets_multi(qconf, pkts_burst, dst_port, nb_rx, 1);
+}
+
 static __rte_always_inline uint16_t
 process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
 {
diff --git a/examples/l3fwd/l3fwd_wqp.c b/examples/l3fwd/l3fwd_wqp.c
new file mode 100644
index 0000000000..eb5b80e358
--- /dev/null
+++ b/examples/l3fwd/l3fwd_wqp.c
@@ -0,0 +1,274 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Huawei Technologies Co., Ltd
+ */
+
+#include "l3fwd.h"
+#include "l3fwd_wqp.h"
+
+/*
+ * Allocate 'sz' bytes of zeroed, cache-line aligned memory on socket 'sid'
+ * and initialise it as an rte_ring with 'num' slots and the given flags.
+ * The ring is named after the address of the allocation.
+ *
+ * Returns -ENOMEM when the allocation fails, otherwise the result of
+ * rte_ring_init(). If rte_ring_init() fails, *r still points to the
+ * allocated memory, which the caller has to release.
+ */
+static int
+wqp_ring_init(struct rte_ring **r, uint32_t num, size_t sz, int32_t sid,
+	uint32_t flags)
+{
+	char name[RTE_RING_NAMESIZE];
+
+	*r = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE, sid);
+	if (*r == NULL)
+		return -ENOMEM;
+
+	snprintf(name, sizeof(name), "%p", *r);
+	return rte_ring_init(*r, name, num, flags);
+}
+
+/*
+ * Allocate 'sz' bytes of zeroed, cache-line aligned memory on socket 'sid'
+ * and initialise it as an rte_soring described by 'prm'.
+ * The soring is named after the address of the allocation.
+ *
+ * Returns -ENOMEM when the allocation fails, otherwise the result of
+ * rte_soring_init(). If rte_soring_init() fails, *r still points to the
+ * allocated memory, which the caller has to release.
+ *
+ * NOTE(review): prm->name is left pointing at this function's local
+ * buffer, so the caller must not read prm->name after the call.
+ */
+static int
+wqp_soring_init(struct rte_soring **r, struct rte_soring_param *prm, size_t sz,
+	int32_t sid)
+{
+	char name[RTE_RING_NAMESIZE];
+
+	*r = rte_zmalloc_socket(NULL, sz, RTE_CACHE_LINE_SIZE, sid);
+	if (*r == NULL)
+		return -ENOMEM;
+
+	snprintf(name, sizeof(name), "%p", *r);
+	prm->name = name;
+	return rte_soring_init(*r, prm);
+}
+
+/*
+ * Release all work queues owned by an I/O lcore and clear its queue pool.
+ * Lcores without RX queues only hold copies of queues owned by I/O lcores
+ * (see l3fwd_wqp_init()), so nothing is freed or cleared for them.
+ *
+ * NOTE(review): both pointers of r[] are freed whatever the mode is. For
+ * ordered queues r[LCORE_WQ_OUT] shares storage with 'ftoken', so this
+ * relies on ftoken never being written in the I/O lcore's copy - confirm.
+ */
+static void
+wqp_fini(struct lcore_conf *lc)
+{
+	uint32_t i, j;
+
+	if (lc->n_rx_queue == 0)
+		return;
+
+	for (i = 0; i != lc->wqpool.nb_queue; i++) {
+		for (j = 0; j != RTE_DIM(lc->wqpool.queue[i].r); j++)
+			rte_free(lc->wqpool.queue[i].r[j]);
+	}
+
+	memset(&lc->wqpool, 0, sizeof(lc->wqpool));
+}
+
+/*
+ * Create the un-ordered work queues of one I/O lcore.
+ *
+ * Every queue is a pair of rings on the lcore's socket: an IN ring that
+ * carries mbuf pointers and an OUT ring that carries struct wqelm
+ * elements. The ring size is prm->qsize rounded up to a power of two.
+ * One queue is created per RX queue of the lcore, or a single one when
+ * prm->single is set (qmask is then 0, so every index maps to queue 0).
+ *
+ * Returns 0 on success. On failure the queues created so far are
+ * released and the error of the failing ring initialisation is returned.
+ */
+static int
+l3fwd_wqp_unque_init(struct lcore_conf *lc, const struct l3fwd_wqp_param *prm,
+	uint32_t lcid)
+{
+	int32_t rc, sid;
+	uint32_t i, n, nq;
+	size_t szi, szo;
+
+	sid = rte_lcore_to_socket_id(lcid);
+
+	n = rte_align32pow2(prm->qsize);
+
+	/* memory needed for the IN (pointer) and OUT (wqelm) rings */
+	szi = rte_ring_get_memsize(n);
+	szo = rte_ring_get_memsize_elem(sizeof(struct wqelm), n);
+
+	nq = (prm->single == 0) ? lc->n_rx_queue : 1;
+	lc->wqpool.nb_queue = nq;
+	lc->wqpool.qmask = (prm->single != 0) ? 0 : UINT32_MAX;
+
+	rc = 0;
+	for (i = 0; i != nq; i++) {
+
+		/* single producer: only the I/O lcore enqueues here */
+		rc = wqp_ring_init(&lc->wqpool.queue[i].r[LCORE_WQ_IN], n, szi,
+			sid, RING_F_SP_ENQ);
+		if (rc != 0)
+			break;
+
+		/* single consumer: only the I/O lcore dequeues here */
+		rc = wqp_ring_init(&lc->wqpool.queue[i].r[LCORE_WQ_OUT], n, szo,
+			sid, RING_F_SC_DEQ);
+		if (rc != 0)
+			break;
+	}
+
+	if (i != nq) {
+		printf("error: %s failed at %u-th queue, error code: %d\n",
+			__func__, i, rc);
+		wqp_fini(lc);
+	}
+
+	/* stored even on failure, after wqp_fini() has cleared the pool */
+	lc->wqpool.prm = *prm;
+	return rc;
+}
+
+/*
+ * Create the ordered work queues of one I/O lcore.
+ *
+ * Every queue is one rte_soring on the lcore's socket with a single
+ * stage, 2 * qprm->qsize elements of pointer size and a 32-bit status
+ * word per element. One queue is created per RX queue of the lcore, or
+ * a single one when qprm->single is set (qmask is then 0, so every index
+ * maps to queue 0).
+ *
+ * Returns 0 on success. On failure the queues created so far are
+ * released and the error of the failing step is returned.
+ */
+static int
+l3fwd_wqp_orque_init(struct lcore_conf *lc, const struct l3fwd_wqp_param *qprm,
+	uint32_t lcid)
+{
+	int32_t rc, sid;
+	uint32_t i, n, nq;
+	ssize_t sz;
+	struct rte_soring_param prm;
+
+	sid = rte_lcore_to_socket_id(lcid);
+
+	memset(&prm, 0, sizeof(prm));
+
+	n = 2 * qprm->qsize;
+	prm.elems = n;
+	prm.esize = sizeof(uintptr_t);
+	prm.stsize = sizeof(uint32_t);
+	prm.stages = 1;
+	/* single-thread sync: only the I/O lcore enqueues and dequeues */
+	prm.prod_synt = RTE_RING_SYNC_ST;
+	prm.cons_synt = RTE_RING_SYNC_ST;
+
+	sz = rte_soring_get_memsize(&prm);
+	if (sz < 0)
+		return sz;
+
+	nq = (qprm->single == 0) ? lc->n_rx_queue : 1;
+	lc->wqpool.nb_queue = nq;
+	lc->wqpool.qmask = (qprm->single != 0) ? 0 : UINT32_MAX;
+
+	rc = 0;
+	for (i = 0; i != nq; i++) {
+
+		rc = wqp_soring_init(&lc->wqpool.queue[i].sor, &prm, sz, sid);
+		if (rc != 0)
+			break;
+	}
+
+	if (i != nq) {
+		printf("error: %s failed at %u-th queue, error code: %d\n",
+			__func__, i, rc);
+		wqp_fini(lc);
+	}
+
+	/* stored even on failure, after wqp_fini() has cleared the pool */
+	lc->wqpool.prm = *qprm;
+	return rc;
+}
+
+/*
+ * Create the work queues of one lcore according to prm->mode.
+ * Only I/O lcores (those with RX queues) own queues: for any other lcore,
+ * and for any mode other than UNQUE/ORQUE, -ENOTSUP is returned.
+ */
+static int
+wqp_init(struct lcore_conf *lc, const struct l3fwd_wqp_param *prm,
+	uint32_t lcid)
+{
+	/* this is I/O poll lcore */
+	if (lc->n_rx_queue != 0) {
+		if (prm->mode == L3FWD_WORKER_UNQUE)
+			return l3fwd_wqp_unque_init(lc, prm, lcid);
+		else if (prm->mode == L3FWD_WORKER_ORQUE)
+			return l3fwd_wqp_orque_init(lc, prm, lcid);
+		else
+			return -ENOTSUP;
+	}
+
+	return -ENOTSUP;
+}
+
+/*
+ * Release the work queues of every lcore in 'lc'.
+ * Only entries with RX queues actually free anything (see wqp_fini()).
+ */
+void
+l3fwd_wqp_fini(struct lcore_conf lc[RTE_MAX_LCORE])
+{
+	uint32_t lcid;
+
+	for (lcid = 0; lcid != RTE_MAX_LCORE; lcid++)
+		wqp_fini(lc + lcid);
+}
+
+/*
+ * Fill in defaults for unset worker-pool parameters.
+ * A qsize of 0 is replaced by 1.5 * max(nb_rxd, nb_txd).
+ * No validation is done yet, so this always returns 0.
+ */
+static int
+check_set_wqp_param(struct l3fwd_wqp_param *prm)
+{
+	uint32_t n;
+
+	if (prm->qsize == 0) {
+		n = RTE_MAX(nb_rxd, nb_txd);
+		n = n + n / 2;
+		prm->qsize = n;
+	}
+
+	return 0;
+}
+
+/* Print the worker-pool parameters to stdout. */
+static void print_wqp_param(const struct l3fwd_wqp_param *prm)
+{
+	printf("%s(%p): mode=%d, qsize=%u, single=%d\n",
+		__func__, prm, prm->mode, prm->qsize, prm->single);
+}
+
+/*
+ * Create the work queues of all I/O lcores and hand them out to the
+ * worker lcores.
+ *
+ * Nothing is done in L3FWD_WORKER_POLL mode or when no enabled lcore has
+ * RX queues. Otherwise every enabled lcore with RX queues gets its own
+ * queues, and every enabled lcore without queues becomes a worker that
+ * receives copies of up to MAX_RX_QUEUE_PER_LCORE of them, assigned
+ * round-robin over the list of all queues.
+ *
+ * @param lc    per-lcore configuration array (RTE_MAX_LCORE entries).
+ * @param qprm  requested parameters; defaults are applied to a local copy.
+ * @return 0 on success, negative errno-style value on failure. On failure
+ *         all queues created so far are released.
+ */
+int
+l3fwd_wqp_init(struct lcore_conf lc[RTE_MAX_LCORE],
+	const struct l3fwd_wqp_param *qprm)
+{
+	int32_t rc;
+	uint32_t i, j, k, lcid, m, n, nrxq, nwqt;
+	union lcore_wq *wqp;
+	struct l3fwd_wqp_param prm;
+
+	if (qprm->mode == L3FWD_WORKER_POLL)
+		return 0;
+
+	prm = *qprm;
+	rc = check_set_wqp_param(&prm);
+	print_wqp_param(&prm);
+	if (rc < 0) {
+		printf("error: %s invalid paramer values\n", __func__);
+		return rc;
+	}
+
+	/* count the work queues to create and the available worker lcores */
+	nrxq = 0;
+	nwqt = 0;
+	for (lcid = 0; lcid != RTE_MAX_LCORE; lcid++) {
+		if (rte_lcore_is_enabled(lcid) == 0)
+			continue;
+		if (lc[lcid].n_rx_queue != 0)
+			nrxq += (prm.single != 0) ? 1 : lc[lcid].n_rx_queue;
+		nwqt += (lc[lcid].n_rx_queue == 0);
+	}
+
+	printf("%s: total worker queues: %u, total WQ threads: %u\n",
+			__func__, nrxq, nwqt);
+	if (nrxq == 0)
+		return 0;
+
+	if (nrxq > nwqt * MAX_RX_QUEUE_PER_LCORE) {
+		printf("error: %s not enough WQ threads to handle all RXQs\n",
+			__func__);
+		return -EINVAL;
+	}
+
+	for (lcid = 0; lcid != RTE_MAX_LCORE; lcid++) {
+		if (rte_lcore_is_enabled(lcid) == 0 || lc[lcid].n_rx_queue == 0)
+			continue;
+		rc = wqp_init(lc + lcid, &prm, lcid);
+		if (rc != 0)
+			break;
+	}
+	if (rc != 0) {
+		/* release the queues of the lcores set up before the failure */
+		l3fwd_wqp_fini(lc);
+		return rc;
+	}
+
+	/* create a temp pool of all RX queues */
+	wqp = malloc(sizeof(wqp[0]) * nrxq);
+	if (wqp == NULL) {
+		l3fwd_wqp_fini(lc);
+		return -ENOMEM;
+	}
+
+	/* at this point only I/O lcores have a non-zero nb_queue */
+	n = 0;
+	for (lcid = 0; lcid != RTE_MAX_LCORE; lcid++) {
+		memcpy(wqp + n, lc[lcid].wqpool.queue,
+			lc[lcid].wqpool.nb_queue * sizeof(wqp[0]));
+		n += lc[lcid].wqpool.nb_queue;
+	}
+
+	/* distribute them across all worker threads */
+	k = 0;
+	/* lc[lcid] is only used inside RTE_DIM(), it is never dereferenced */
+	m = RTE_MIN(RTE_DIM(lc[lcid].wqpool.queue), n);
+	for (lcid = 0; lcid != RTE_MAX_LCORE; lcid++) {
+		if (rte_lcore_is_enabled(lcid) == 0 ||
+				lc[lcid].wqpool.nb_queue != 0)
+			continue;
+		/* each worker continues where the previous one stopped */
+		j = k;
+		for (i = 0; i != m; i++) {
+			lc[lcid].wqpool.queue[i] = wqp[j];
+			j = (j + 1) % n;
+		}
+		lc[lcid].wqpool.nb_queue = i;
+		lc[lcid].wqpool.qmask = UINT32_MAX;
+		lc[lcid].wqpool.prm = prm;
+		k = j;
+	}
+
+	free(wqp);
+	return rc;
+}
diff --git a/examples/l3fwd/l3fwd_wqp.h b/examples/l3fwd/l3fwd_wqp.h
new file mode 100644
index 0000000000..9a6a7eca6c
--- /dev/null
+++ b/examples/l3fwd/l3fwd_wqp.h
@@ -0,0 +1,132 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Huawei Technologies Co., Ltd
+ */
+
+#ifndef L3FWD_WQP_H
+#define L3FWD_WQP_H
+
+#include <rte_soring.h>
+
+/*
+ * Element of the OUT ring in un-ordered mode: a processed packet together
+ * with its processing result (stored into hops[] by lcore_wq_receive()).
+ */
+struct wqelm {
+	struct rte_mbuf *mb;
+	uint32_t rc;
+}  __rte_packed;
+
+
+void l3fwd_wqp_fini(struct lcore_conf lc[RTE_MAX_LCORE]);
+int l3fwd_wqp_init(struct lcore_conf lc[RTE_MAX_LCORE],
+	const struct l3fwd_wqp_param *prm);
+
+/*
+ * I/O thread, stage 1: hand a burst of received packets to the workers.
+ *
+ * 'idx' is masked with wqp->qmask before it selects the queue. Returns
+ * the value reported by the underlying enqueue call; the caller treats it
+ * as the number of packets accepted and frees the rest. For any other
+ * mode rte_errno is set to ENOTSUP and 0 is returned.
+ */
+static inline uint32_t
+lcore_wq_submit(const struct lcore_wq_pool *wqp, uint32_t idx,
+	struct rte_mbuf * const pkts[MAX_PKT_BURST], uint32_t num)
+{
+	idx &= wqp->qmask;
+
+	if (wqp->prm.mode == L3FWD_WORKER_UNQUE) {
+		struct rte_ring *r = wqp->queue[idx].r[LCORE_WQ_IN];
+		return rte_ring_enqueue_burst(r, (void * const *)pkts, num,
+				NULL);
+	} else if (wqp->prm.mode == L3FWD_WORKER_ORQUE) {
+		struct rte_soring *sor = wqp->queue[idx].sor;
+		return rte_soring_enqueue(sor, pkts, NULL, num,
+			RTE_RING_QUEUE_VARIABLE, NULL);
+	}
+
+	rte_errno = ENOTSUP;
+	return 0;
+}
+
+/*
+ * I/O thread, stage 3: fetch processed packets and their results.
+ *
+ * 'idx' is masked with wqp->qmask before it selects the queue. Up to
+ * 'num' packets are stored in pkts[] and the matching results in hops[].
+ * 'num' must not exceed MAX_PKT_BURST, the size of the local buffers.
+ * Returns the number of packets fetched; for any other mode rte_errno is
+ * set to ENOTSUP and 0 is returned.
+ */
+static inline uint32_t
+lcore_wq_receive(const struct lcore_wq_pool *wqp, uint32_t idx,
+	struct rte_mbuf *pkts[MAX_PKT_BURST], uint16_t hops[MAX_PKT_BURST],
+	uint32_t num)
+{
+	uint32_t i, n;
+	uint32_t rcs[MAX_PKT_BURST];
+	struct wqelm elm[MAX_PKT_BURST];
+
+	idx &= wqp->qmask;
+
+	if (wqp->prm.mode == L3FWD_WORKER_UNQUE) {
+		struct rte_ring *r = wqp->queue[idx].r[LCORE_WQ_OUT];
+
+		n = rte_ring_dequeue_burst_elem(r, elm, sizeof(elm[0]), num,
+				NULL);
+		/* split the (mbuf, result) pairs into the two output arrays */
+		for (i = 0; i != n; i++) {
+			pkts[i] = elm[i].mb;
+			hops[i] = elm[i].rc;
+		}
+
+		return n;
+
+	} else if (wqp->prm.mode == L3FWD_WORKER_ORQUE) {
+		struct rte_soring *sor = wqp->queue[idx].sor;
+
+		n = rte_soring_dequeue(sor, pkts, rcs, num,
+				RTE_RING_QUEUE_VARIABLE, NULL);
+		/* results are 32-bit in the soring, 16-bit for the caller */
+		for (i = 0; i != n; i++)
+			hops[i] = rcs[i];
+
+		return n;
+	}
+
+	rte_errno = ENOTSUP;
+	return 0;
+}
+
+/*
+ * Worker thread: take up to 'num' packets to process from a work queue.
+ *
+ * 'idx' is masked with wqp->qmask before it selects the queue. In ordered
+ * mode the token written by rte_soring_acquire() is kept in the queue's
+ * 'ftoken' field and used by the following lcore_wq_push() - which is why
+ * 'wqp' is not const here. Returns the number of packets obtained; for
+ * any other mode rte_errno is set to ENOTSUP and 0 is returned.
+ */
+static inline uint32_t
+lcore_wq_pull(struct lcore_wq_pool *wqp, uint32_t idx,
+	struct rte_mbuf *pkts[MAX_PKT_BURST], uint32_t num)
+{
+	idx &= wqp->qmask;
+
+	if (wqp->prm.mode == L3FWD_WORKER_UNQUE) {
+		struct rte_ring *r = wqp->queue[idx].r[LCORE_WQ_IN];
+		return rte_ring_dequeue_burst(r, (void **)pkts, num, NULL);
+
+	} else if (wqp->prm.mode == L3FWD_WORKER_ORQUE) {
+		struct rte_soring *sor = wqp->queue[idx].sor;
+		return rte_soring_acquire(sor, pkts, NULL, 0, num,
+			RTE_RING_QUEUE_VARIABLE, &wqp->queue[idx].ftoken, NULL);
+	}
+
+	rte_errno = ENOTSUP;
+	return 0;
+}
+
+/*
+ * Worker thread: return processed packets and their results.
+ *
+ * 'idx' is masked with wqp->qmask before it selects the queue. 'num' must
+ * not exceed MAX_PKT_BURST, the size of the local buffers.
+ * In un-ordered mode the (mbuf, result) pairs are enqueued to the OUT
+ * ring and the number actually enqueued is returned. In ordered mode the
+ * elements acquired by the preceding lcore_wq_pull() are released with
+ * their results and 'num' is always returned. For any other mode rte_errno
+ * is set to ENOTSUP and 0 is returned.
+ */
+static inline uint32_t
+lcore_wq_push(const struct lcore_wq_pool *wqp, uint32_t idx,
+	struct rte_mbuf * const pkts[MAX_PKT_BURST],
+	const uint16_t hops[MAX_PKT_BURST], uint32_t num)
+{
+	uint32_t i;
+	uint32_t rcs[MAX_PKT_BURST];
+	struct wqelm elm[MAX_PKT_BURST];
+
+	idx &= wqp->qmask;
+
+	if (wqp->prm.mode == L3FWD_WORKER_UNQUE) {
+		struct rte_ring *r = wqp->queue[idx].r[LCORE_WQ_OUT];
+
+		for (i = 0; i != num; i++) {
+			elm[i].mb = pkts[i];
+			elm[i].rc = hops[i];
+		}
+		return rte_ring_enqueue_burst_elem(r, elm, sizeof(elm[0]), num,
+				NULL);
+	} else if (wqp->prm.mode == L3FWD_WORKER_ORQUE) {
+		struct rte_soring *sor = wqp->queue[idx].sor;
+
+		/* widen the 16-bit hops to the 32-bit soring status words */
+		for (i = 0; i != num; i++)
+			rcs[i] = hops[i];
+		rte_soring_release(sor, NULL, rcs, 0, num,
+			wqp->queue[idx].ftoken);
+		return num;
+	}
+
+	rte_errno = ENOTSUP;
+	return 0;
+}
+
+#endif /* L3FWD_WQP_H */
diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index 01b763e5ba..5d2bc5c611 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -47,6 +47,7 @@ 
 #include "l3fwd.h"
 #include "l3fwd_event.h"
 #include "l3fwd_route.h"
+#include "l3fwd_wqp.h"
 
 #define MAX_TX_QUEUE_PER_PORT RTE_MAX_LCORE
 #define MAX_RX_QUEUE_PER_PORT 128
@@ -69,6 +70,10 @@  enum L3FWD_LOOKUP_MODE {
 };
 static enum L3FWD_LOOKUP_MODE lookup_mode;
 
+/*
+ * Worker queue pool parameters. Defaults to plain poll mode; .mode and
+ * .single are set by parse_mode(), .qsize by the "wqsize" option, and the
+ * structure is passed to l3fwd_wqp_init() from main().
+ */
+struct l3fwd_wqp_param l3fwd_wqp_param = {
+	.mode = L3FWD_WORKER_POLL,
+};
+
 /* Global variables. */
 static int numa_on = 1; /**< NUMA is enabled by default. */
 static int parse_ptype; /**< Parse packet type using rx callback, and */
@@ -246,6 +251,8 @@  const struct ipv6_l3fwd_route ipv6_l3fwd_route_array[] = {
 	{{32, 1, 2, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0}, 64, 15},
 };
 
+/* Value of the "lookup-iter" option (accepted range 1..UINT16_MAX), default 1. */
+uint32_t l3fwd_lookup_iter_num = 1;
+
 /*
  * API's called during initialization to setup ACL/EM/LPM rules.
  */
@@ -453,6 +460,23 @@  print_usage(const char *prgname)
 		ACL_LEAD_CHAR, ROUTE_LEAD_CHAR, alg);
 }
 
+/*
+ * Parse a string as an unsigned integer within [min, max].
+ *
+ * The base is auto-detected by strtoul() (base argument 0).
+ * On success the value is stored in *val and 0 is returned.
+ * Returns -1, leaving *val untouched, when the string contains no digits,
+ * has trailing characters, carries a minus sign, makes strtoul() report an
+ * error through errno, or yields a value outside [min, max].
+ */
+static int
+parse_uint_val(const char *str, uint32_t *val, uint32_t min, uint32_t max)
+{
+	char *end = NULL;
+	unsigned long v;
+
+	/*
+	 * strtoul() accepts "-N" and returns the negated value, which could
+	 * wrap around into the valid range, so refuse any minus sign.
+	 */
+	if (strchr(str, '-') != NULL)
+		return -1;
+
+	errno = 0;
+	v = strtoul(str, &end, 0);
+
+	/* end == str: nothing was converted (empty or non-numeric input) */
+	if (errno != 0 || end == str || *end != '\0')
+		return -1;
+	if (v < min || v > max)
+		return -1;
+
+	/* safe narrowing: v <= max, and max itself is a uint32_t */
+	*val = (uint32_t)v;
+	return 0;
+}
+
 static int
 parse_max_pkt_len(const char *pktlen)
 {
@@ -572,16 +596,35 @@  parse_eth_dest(const char *optarg)
 }
 
 static void
-parse_mode(const char *optarg __rte_unused)
+parse_mode(const char *optarg)
 {
+	l3fwd_wqp_param.mode = L3FWD_WORKER_POLL;
+
 #ifdef RTE_LIB_EVENTDEV
 	struct l3fwd_event_resources *evt_rsrc = l3fwd_get_eventdev_rsrc();
 
+	evt_rsrc->enabled = false;
+
 	if (!strcmp(optarg, "poll"))
 		evt_rsrc->enabled = false;
 	else if (!strcmp(optarg, "eventdev"))
 		evt_rsrc->enabled = true;
+	else
 #endif
+	if (strcmp(optarg, "wqorder") == 0) {
+		l3fwd_wqp_param.mode = L3FWD_WORKER_ORQUE;
+		l3fwd_wqp_param.single = 0;
+	} else if (strcmp(optarg, "wqunorder") == 0) {
+		l3fwd_wqp_param.mode = L3FWD_WORKER_UNQUE;
+		l3fwd_wqp_param.single = 0;
+	} else if (strcmp(optarg, "wqorderS") == 0) {
+		l3fwd_wqp_param.mode = L3FWD_WORKER_ORQUE;
+		l3fwd_wqp_param.single = 1;
+	} else if (strcmp(optarg, "wqunorderS") == 0) {
+		l3fwd_wqp_param.mode = L3FWD_WORKER_UNQUE;
+		l3fwd_wqp_param.single = 1;
+	} else
+		rte_exit(EXIT_FAILURE, "unknown mode: %s\n", optarg);
 }
 
 static void
@@ -698,6 +741,8 @@  static const char short_options[] =
 #define CMD_LINE_OPT_RULE_IPV4 "rule_ipv4"
 #define CMD_LINE_OPT_RULE_IPV6 "rule_ipv6"
 #define CMD_LINE_OPT_ALG "alg"
+#define CMD_LINE_OPT_WQSIZE "wqsize"
+#define CMD_LINE_OPT_LOOKUP_ITER "lookup-iter"
 
 enum {
 	/* long options mapped to a short option */
@@ -726,7 +771,9 @@  enum {
 	CMD_LINE_OPT_LOOKUP_NUM,
 	CMD_LINE_OPT_ENABLE_VECTOR_NUM,
 	CMD_LINE_OPT_VECTOR_SIZE_NUM,
-	CMD_LINE_OPT_VECTOR_TMO_NS_NUM
+	CMD_LINE_OPT_VECTOR_TMO_NS_NUM,
+	CMD_LINE_OPT_WQSIZE_NUM,
+	CMD_LINE_OPT_LOOKUP_ITER_NUM,
 };
 
 static const struct option lgopts[] = {
@@ -753,6 +800,8 @@  static const struct option lgopts[] = {
 	{CMD_LINE_OPT_RULE_IPV4,   1, 0, CMD_LINE_OPT_RULE_IPV4_NUM},
 	{CMD_LINE_OPT_RULE_IPV6,   1, 0, CMD_LINE_OPT_RULE_IPV6_NUM},
 	{CMD_LINE_OPT_ALG,   1, 0, CMD_LINE_OPT_ALG_NUM},
+	{CMD_LINE_OPT_WQSIZE, 1, 0, CMD_LINE_OPT_WQSIZE_NUM},
+	{CMD_LINE_OPT_LOOKUP_ITER, 1, 0, CMD_LINE_OPT_LOOKUP_ITER_NUM},
 	{NULL, 0, 0, 0}
 };
 
@@ -934,6 +983,18 @@  parse_args(int argc, char **argv)
 		case CMD_LINE_OPT_ALG_NUM:
 			l3fwd_set_alg(optarg);
 			break;
+		case CMD_LINE_OPT_WQSIZE_NUM:
+			ret = parse_uint_val(optarg, &l3fwd_wqp_param.qsize,
+				RX_DESC_DEFAULT, UINT16_MAX);
+			if (ret < 0)
+				return ret;
+			break;
+		case CMD_LINE_OPT_LOOKUP_ITER_NUM:
+			ret = parse_uint_val(optarg, &l3fwd_lookup_iter_num,
+				1, UINT16_MAX);
+			if (ret < 0)
+				return ret;
+			break;
 		default:
 			print_usage(prgname);
 			return -1;
@@ -1588,6 +1649,8 @@  main(int argc, char **argv)
 			l3fwd_lkp.main_loop = evt_rsrc->ops.em_event_loop;
 		else if (lookup_mode == L3FWD_LOOKUP_FIB)
 			l3fwd_lkp.main_loop = evt_rsrc->ops.fib_event_loop;
+		else if (lookup_mode == L3FWD_LOOKUP_ACL)
+			l3fwd_lkp.main_loop = evt_rsrc->ops.acl_event_loop;
 		else
 			l3fwd_lkp.main_loop = evt_rsrc->ops.lpm_event_loop;
 	} else
@@ -1640,6 +1703,12 @@  main(int argc, char **argv)
 		}
 	}
 
+	/* init worker queues for lcores (if any) */
+	ret = l3fwd_wqp_init(lcore_conf, &l3fwd_wqp_param);
+	if (ret != 0)
+		rte_exit(EXIT_FAILURE, "l3fwd_wqp_init: err=%d, lcore=%u\n",
+			ret, lcore_id);
+
 	check_all_ports_link_status(enabled_port_mask);
 
 	ret = 0;
@@ -1695,6 +1764,8 @@  main(int argc, char **argv)
 	/* clean up config file routes */
 	l3fwd_lkp.free_routes();
 
+	l3fwd_wqp_fini(lcore_conf);
+
 	/* clean up the EAL */
 	rte_eal_cleanup();
 
diff --git a/examples/l3fwd/meson.build b/examples/l3fwd/meson.build
index c25de77bba..a024492fb1 100644
--- a/examples/l3fwd/meson.build
+++ b/examples/l3fwd/meson.build
@@ -16,6 +16,7 @@  sources = files(
         'l3fwd_event_generic.c',
         'l3fwd_fib.c',
         'l3fwd_lpm.c',
+        'l3fwd_wqp.c',
         'main.c',
 )
 if dpdk_conf.has('RTE_LIB_EVENTDEV')