diff mbox series

[v13,1/7] ethdev: introduce shared Rx queue

Message ID 20211021104142.2649060-2-xuemingl@nvidia.com (mailing list archive)
State Accepted, archived
Delegated to: Ferruh Yigit
Headers show
Series ethdev: introduce shared Rx queue | expand


Context Check Description
ci/iol-testing warning apply patch failure
ci/checkpatch success coding style OK

Commit Message

Xueming Li Oct. 21, 2021, 10:41 a.m. UTC
In current DPDK framework, each Rx queue is pre-loaded with mbufs to
save incoming packets. For some PMDs, when the number of representors
scales out in a switch domain, the memory consumption becomes significant.
Polling all ports also leads to high cache miss, high latency and low
throughput.

This patch introduces shared Rx queue. Ports in same Rx domain and
switch domain could share Rx queue set by specifying non-zero sharing
group in Rx queue configuration.

Shared Rx queue is identified by share_qid field of Rx queue
configuration. Port A RxQ X can share RxQ with Port B RxQ Y by using
same shared Rx queue ID.

No special API is defined to receive packets from shared Rx queue.
Polling any member port of a shared Rx queue receives packets of that
queue for all member ports, port_id is identified by mbuf->port. PMD is
responsible to resolve shared Rx queue from device and queue data.

Shared Rx queue must be polled in the same thread or core; polling a
queue ID of any member port is essentially the same.

Multiple share groups are supported. PMD should support mixed
configuration by allowing multiple share groups and non-shared Rx queue
on one port.

Example grouping and polling model to reflect service priority:
 Group1, 2 shared Rx queues per port: PF, rep0, rep1
 Group2, 1 shared Rx queue per port: rep2, rep3, ... rep127
 Core0: poll PF queue0
 Core1: poll PF queue1
 Core2: poll rep2 queue0

PMD advertises shared Rx queue capability via RTE_ETH_DEV_CAPA_RXQ_SHARE.

PMD is responsible for shared Rx queue consistency checks to prevent
member ports' configurations from contradicting each other.

Signed-off-by: Xueming Li <xuemingl@nvidia.com>
Reviewed-by: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>
Acked-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
 doc/guides/nics/features.rst                  | 13 ++++++++++
 doc/guides/nics/features/default.ini          |  1 +
 .../prog_guide/switch_representation.rst      | 11 +++++++++
 doc/guides/rel_notes/release_21_11.rst        |  6 +++++
 lib/ethdev/rte_ethdev.c                       |  8 +++++++
 lib/ethdev/rte_ethdev.h                       | 24 +++++++++++++++++++
 6 files changed, 63 insertions(+)
diff mbox series


diff --git a/doc/guides/nics/features.rst b/doc/guides/nics/features.rst
index 8dd421ca013..d35751d5b5a 100644
--- a/doc/guides/nics/features.rst
+++ b/doc/guides/nics/features.rst
@@ -614,6 +614,19 @@  Supports inner packet L4 checksum.
+.. _nic_features_shared_rx_queue:
+Shared Rx queue
+Supports shared Rx queue for ports in same Rx domain of a switch domain.
+* **[uses]     rte_eth_dev_info**: ``dev_capa:RTE_ETH_DEV_CAPA_RXQ_SHARE``.
+* **[uses]     rte_eth_dev_info,rte_eth_switch_info**: ``rx_domain``, ``domain_id``.
+* **[uses]     rte_eth_rxconf**: ``share_group``, ``share_qid``.
+* **[provides] mbuf**: ``mbuf.port``.
 .. _nic_features_packet_type_parsing:
 Packet type parsing
diff --git a/doc/guides/nics/features/default.ini b/doc/guides/nics/features/default.ini
index 09914b1ad32..39d21fcd379 100644
--- a/doc/guides/nics/features/default.ini
+++ b/doc/guides/nics/features/default.ini
@@ -19,6 +19,7 @@  Free Tx mbuf on demand =
 Queue start/stop     =
 Runtime Rx queue setup =
 Runtime Tx queue setup =
+Shared Rx queue      =
 Burst mode info      =
 Power mgmt address monitor =
 MTU update           =
diff --git a/doc/guides/prog_guide/switch_representation.rst b/doc/guides/prog_guide/switch_representation.rst
index ff6aa91c806..4f2532a91ea 100644
--- a/doc/guides/prog_guide/switch_representation.rst
+++ b/doc/guides/prog_guide/switch_representation.rst
@@ -123,6 +123,17 @@  thought as a software "patch panel" front-end for applications.
 .. [1] `Ethernet switch device driver model (switchdev)
+- For some PMDs, memory usage of representors is huge when number of
+  representor grows, mbufs are allocated for each descriptor of Rx queue.
+  Polling large number of ports brings more CPU load, cache miss and
+  latency. Shared Rx queue can be used to share Rx queue between PF and
+  representors among same Rx domain. ``RTE_ETH_DEV_CAPA_RXQ_SHARE`` in
+  device info is used to indicate the capability. Setting non-zero share
+  group in Rx queue configuration to enable share, share_qid is used to
+  identify the shared Rx queue in group. Polling any member port can
+  receive packets of all member ports in the group, port ID is saved in
+  ``mbuf.port``.
 Basic SR-IOV
diff --git a/doc/guides/rel_notes/release_21_11.rst b/doc/guides/rel_notes/release_21_11.rst
index 74776ca0691..f4fb68e7408 100644
--- a/doc/guides/rel_notes/release_21_11.rst
+++ b/doc/guides/rel_notes/release_21_11.rst
@@ -75,6 +75,12 @@  New Features
   * Added multi-process support.
+* **Added ethdev shared Rx queue support.**
+  * Added new device capability flag and Rx domain field to switch info.
+  * Added share group and share queue ID to Rx queue configuration.
+  * Added testpmd support and dedicated forwarding engine.
 * **Added support to get all MAC addresses of a device.**
   Added ``rte_eth_macaddrs_get`` to allow user to retrieve all Ethernet
diff --git a/lib/ethdev/rte_ethdev.c b/lib/ethdev/rte_ethdev.c
index 1f18aa916cc..31a9cba065b 100644
--- a/lib/ethdev/rte_ethdev.c
+++ b/lib/ethdev/rte_ethdev.c
@@ -2175,6 +2175,14 @@  rte_eth_rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id,
 		return -EINVAL;
+	if (local_conf.share_group > 0 &&
+	    (dev_info.dev_capa & RTE_ETH_DEV_CAPA_RXQ_SHARE) == 0) {
+			"Ethdev port_id=%d rx_queue_id=%d, enabled share_group=%hu while device doesn't support Rx queue share\n",
+			port_id, rx_queue_id, local_conf.share_group);
+		return -EINVAL;
+	}
 	 * If LRO is enabled, check that the maximum aggregated packet
 	 * size is supported by the configured device.
diff --git a/lib/ethdev/rte_ethdev.h b/lib/ethdev/rte_ethdev.h
index 014270d3167..40f88cc3d64 100644
--- a/lib/ethdev/rte_ethdev.h
+++ b/lib/ethdev/rte_ethdev.h
@@ -1045,6 +1045,14 @@  struct rte_eth_rxconf {
 	uint8_t rx_drop_en; /**< Drop packets if no descriptors are available. */
 	uint8_t rx_deferred_start; /**< Do not start queue with rte_eth_dev_start(). */
 	uint16_t rx_nseg; /**< Number of descriptions in rx_seg array. */
+	/**
+	 * Share group index in Rx domain and switch domain.
+	 * Non-zero value to enable Rx queue share, zero value disable share.
+	 * PMD is responsible for Rx queue consistency checks to avoid member
+	 * port's configuration contradict to each other.
+	 */
+	uint16_t share_group;
+	uint16_t share_qid; /**< Shared Rx queue ID in group. */
 	 * Per-queue Rx offloads to be set using DEV_RX_OFFLOAD_* flags.
 	 * Only offloads set on rx_queue_offload_capa or rx_offload_capa
@@ -1445,6 +1453,16 @@  struct rte_eth_conf {
 /** Device supports Tx queue setup after device started. */
+ * Device supports shared Rx queue among ports within Rx domain and
+ * switch domain. Mbufs are consumed by shared Rx queue instead of
+ * each queue. Multiple groups are supported by share_group of Rx
+ * queue configuration. Shared Rx queue is identified by PMD using
+ * share_qid of Rx queue configuration. Polling any port in the group
+ * receive packets of all member ports, source port identified by
+ * mbuf->port field.
+ */
+#define RTE_ETH_DEV_CAPA_RXQ_SHARE              RTE_BIT64(2)
@@ -1488,6 +1506,12 @@  struct rte_eth_switch_info {
 	 * but each driver should explicitly define the mapping of switch
 	 * port identifier to that physical interconnect/switch
+	/**
+	 * Shared Rx queue sub-domain boundary. Only ports in same Rx domain
+	 * and switch domain can share Rx queue. Valid only if device advertised
+	 * RTE_ETH_DEV_CAPA_RXQ_SHARE capability.
+	 */
+	uint16_t rx_domain;