[v4] net/netvsc: fix number Tx queues > Rx queues
Checks
Commit Message
The previous code allowed the number of Tx queues to be set higher than the number of Rx queues. If a packet was sent on a Tx queue with index
>= number Rx queues there was a segfault due to accessing beyond the end of the dev->data->rx_queues[] array.
#0 rte_spinlock_trylock (sl = invalid address) at /include/rte_spinlock.h L63
#1 hn_process_events at /drivers/net/netvsc/hn_rxtx.c L 1129
#2 hn_xmit_pkts at /drivers/net/netvsc/hn_rxtx.c L1553
This commit fixes the issue by creating an Rx queue for every Tx queue meaning that an event buffer is allocated to handle receiving Tx completion messages.
mbuf pool and Rx ring are not allocated for these additional Rx queues and RSS configuration ensures that no packets are received on them.
Fixes: 4e9c73e96e83 ("net/netvsc: add Hyper-V network device")
Cc: sthemmin@microsoft.com
Cc: stable@dpdk.org
Signed-off-by: Alan Elder <alan.elder@microsoft.com>
---
v4:
* Include segfault core stack in commit message
v3:
* Handle case of Rx queue creation failure in hn_dev_tx_queue_setup.
* Re-use rx queue if it has already been allocated.
* Don't allocate an mbuf if pool is NULL. This avoids segfault if RSS
configuration is incorrect.
v2:
* Remove function declaration for static non-member function
---
drivers/net/netvsc/hn_ethdev.c | 9 +++++
drivers/net/netvsc/hn_rxtx.c | 70 +++++++++++++++++++++++++++++-----
2 files changed, 70 insertions(+), 9 deletions(-)
struct hn_tx_queue *txq;
+ struct hn_rx_queue *rxq = NULL;
char name[RTE_MEMPOOL_NAMESIZE];
uint32_t tx_free_thresh;
int err = -ENOMEM;
@@ -301,6 +313,27 @@ hn_dev_tx_queue_setup(struct rte_eth_dev *dev,
goto error;
}
+ /*
+ * If there are more Tx queues than Rx queues, allocate rx_queues
+ * with event buffer so that Tx completion messages can still be
+ * received
+ */
+ if (queue_idx >= dev->data->nb_rx_queues) {
+ rxq = hn_rx_queue_alloc(hv, queue_idx, socket_id);
+
+ if (!rxq) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ /*
+ * Don't allocate mbuf pool or rx ring. RSS is always configured
+ * to ensure packets aren't received by this Rx queue.
+ */
+ rxq->mb_pool = NULL;
+ rxq->rx_ring = NULL;
+ }
+
txq->agg_szmax = RTE_MIN(hv->chim_szmax, hv->rndis_agg_size);
txq->agg_pktmax = hv->rndis_agg_pkts;
txq->agg_align = hv->rndis_agg_align;
@@ -311,12 +344,15 @@ hn_dev_tx_queue_setup(struct rte_eth_dev *dev,
socket_id, tx_conf);
if (err == 0) {
dev->data->tx_queues[queue_idx] = txq;
+ if (rxq != NULL)
+ dev->data->rx_queues[queue_idx] = rxq;
return 0;
}
error:
rte_mempool_free(txq->txdesc_pool);
rte_memzone_free(txq->tx_rndis_mz);
+ hn_rx_queue_free_common(rxq);
rte_free(txq);
return err;
}
@@ -364,6 +400,13 @@ hn_dev_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
if (!txq)
return;
+ /*
+ * Free any Rx queues allocated for a Tx queue without a corresponding
+ * Rx queue
+ */
+ if (qid >= dev->data->nb_rx_queues)
+ hn_rx_queue_free_common(dev->data->rx_queues[qid]);
+
rte_mempool_free(txq->txdesc_pool);
rte_memzone_free(txq->tx_rndis_mz);
@@ -552,10 +595,12 @@ static void hn_rxpkt(struct hn_rx_queue *rxq, struct hn_rx_bufinfo *rxb,
const struct hn_rxinfo *info)
{
struct hn_data *hv = rxq->hv;
- struct rte_mbuf *m;
+ struct rte_mbuf *m = NULL;
bool use_extbuf = false;
- m = rte_pktmbuf_alloc(rxq->mb_pool);
+ if (likely(rxq->mb_pool != NULL))
+ m = rte_pktmbuf_alloc(rxq->mb_pool);
+
if (unlikely(!m)) {
struct rte_eth_dev *dev =
&rte_eth_devices[rxq->port_id];
@@ -942,7 +987,15 @@ hn_dev_rx_queue_setup(struct rte_eth_dev *dev,
if (queue_idx == 0) {
rxq = hv->primary;
} else {
- rxq = hn_rx_queue_alloc(hv, queue_idx, socket_id);
+ /*
+ * If the number of Tx queues was previously greater than the
+ * number of Rx queues, we may already have allocated an rxq.
+ */
+ if (!dev->data->rx_queues[queue_idx])
+ rxq = hn_rx_queue_alloc(hv, queue_idx, socket_id);
+ else
+ rxq = dev->data->rx_queues[queue_idx];
+
if (!rxq)
return -ENOMEM;
}
@@ -975,9 +1028,10 @@ hn_dev_rx_queue_setup(struct rte_eth_dev *dev,
fail:
rte_ring_free(rxq->rx_ring);
- rte_free(rxq->rxbuf_info);
- rte_free(rxq->event_buf);
- rte_free(rxq);
+ /* Only free rxq if it was created in this function. */
+ if (!dev->data->rx_queues[queue_idx])
+ hn_rx_queue_free_common(rxq);
+
return error;
}
@@ -998,9 +1052,7 @@ hn_rx_queue_free(struct hn_rx_queue *rxq, bool keep_primary)
if (keep_primary && rxq == rxq->hv->primary)
return;
- rte_free(rxq->rxbuf_info);
- rte_free(rxq->event_buf);
- rte_free(rxq);
+ hn_rx_queue_free_common(rxq);
}
void
--
2.25.1
Comments
On 4/15/2024 3:40 PM, Alan Elder wrote:
> The previous code allowed the number of Tx queues to be set higher than the number of Rx queues. If a packet was sent on a Tx queue with index
> >= number Rx queues there was a segfault due to accessing beyond the end of the dev->data->rx_queues[] array.
>
> #0 rte_spinlock_trylock (sl = invalid address) at /include/rte_spinlock.h L63
> #1 hn_process_events at /drivers/net/netvsc/hn_rxtx.c L 1129
> #2 hn_xmit_pkts at /drivers/net/netvsc/hn_rxtx.c L1553
>
> This commit fixes the issue by creating an Rx queue for every Tx queue meaning that an event buffer is allocated to handle receiving Tx completion messages.
>
> mbuf pool and Rx ring are not allocated for these additional Rx queues and RSS configuration ensures that no packets are received on them.
>
> Fixes: 4e9c73e96e83 ("net/netvsc: add Hyper-V network device")
> Cc: sthemmin@microsoft.com
> Cc: stable@dpdk.org
>
> Signed-off-by: Alan Elder <alan.elder@microsoft.com>
<...>
> @@ -552,10 +595,12 @@ static void hn_rxpkt(struct hn_rx_queue *rxq, struct hn_rx_bufinfo *rxb,
> const struct hn_rxinfo *info)
> {
> struct hn_data *hv = rxq->hv;
> - struct rte_mbuf *m;
> + struct rte_mbuf *m = NULL;
> bool use_extbuf = false;
>
> - m = rte_pktmbuf_alloc(rxq->mb_pool);
> + if (likely(rxq->mb_pool != NULL))
> + m = rte_pktmbuf_alloc(rxq->mb_pool);
> +
>
This introduces an additional check in the Rx path; I am not sure what the
performance impact is.
I can see Long already acked the v3, I just want to double check.
If Tx queue number > Rx queue number is not a common usecase, perhaps it
can be an option to forbid it instead of getting performance hit.
Or it can be possible to have a dedicated Rx queue, like queue_id 0, for
Tx completion events for Tx queue_id > Rx queue number, etc..
But Long if you prefer to continue with this patch, please ack it and I
can continue with it.
> This introduced additional check in Rx path, not sure what is the performance
> impact.
>
> I can see Long already acked the v3, I just want to double check.
> If Tx queue number > Rx queue number is not a common usecase, perhaps it can
> be an option to forbid it instead of getting performance hit.
> Or it can be possible to have a dedicated Rx queue, like queue_id 0, for Tx
> completion events for Tx queue_id > Rx queue number, etc..
>
> But Long if you prefer to continue with this patch, please ack it and I can continue
> with it.
Ferruh, thank you for raising this concern. We will run some tests to evaluate performance impact of this patch.
Will update soon.
Long
> From: Alan Elder [mailto:alan.elder@microsoft.com]
> Sent: Monday, 15 April 2024 16.41
>
> The previous code allowed the number of Tx queues to be set higher than the
> number of Rx queues. If a packet was sent on a Tx queue with index
> >= number Rx queues there was a segfault due to accessing beyond the end of
> the dev->data->rx_queues[] array.
>
> #0 rte_spinlock_trylock (sl = invalid address) at /include/rte_spinlock.h L63
> #1 hn_process_events at /drivers/net/netvsc/hn_rxtx.c L 1129
> #2 hn_xmit_pkts at /drivers/net/netvsc/hn_rxtx.c L1553
>
> This commit fixes the issue by creating an Rx queue for every Tx queue meaning
> that an event buffer is allocated to handle receiving Tx completion messages.
>
> mbuf pool and Rx ring are not allocated for these additional Rx queues and RSS
> configuration ensures that no packets are received on them.
>
> Fixes: 4e9c73e96e83 ("net/netvsc: add Hyper-V network device")
> Cc: sthemmin@microsoft.com
> Cc: stable@dpdk.org
>
> Signed-off-by: Alan Elder <alan.elder@microsoft.com>
> ---
Are there any requirements on the order in which the application must call rte_eth_rx_queue_setup() and rte_eth_tx_queue_setup()?
I.e. does it work if rte_eth_tx_queue_setup() is called before rte_eth_rx_queue_setup(), and in the opposite order?
Although the ethdev documentation says:
"The functions exported by the application Ethernet API to setup a device designated by its port identifier must be invoked in the following order:
rte_eth_dev_configure()
rte_eth_tx_queue_setup()
rte_eth_rx_queue_setup()
rte_eth_dev_start()",
I would assume the order of calling rte_eth_tx_queue_setup() and rte_eth_rx_queue_setup() doesn't matter.
And the rte_eth_dev_reset() function documentation has rx/tx queue setup in the opposite order:
"After calling rte_eth_dev_reset(), the application should use rte_eth_dev_configure(), rte_eth_rx_queue_setup(), rte_eth_tx_queue_setup(), and rte_eth_dev_start() to reconfigure the device as appropriate."
On 5/1/2024 8:43 AM, Morten Brørup wrote:
>> From: Alan Elder [mailto:alan.elder@microsoft.com]
>> Sent: Monday, 15 April 2024 16.41
>>
>> The previous code allowed the number of Tx queues to be set higher than the
>> number of Rx queues. If a packet was sent on a Tx queue with index
>> >= number Rx queues there was a segfault due to accessing beyond the end of
>> the dev->data->rx_queues[] array.
>>
>> #0 rte_spinlock_trylock (sl = invalid address) at /include/rte_spinlock.h L63
>> #1 hn_process_events at /drivers/net/netvsc/hn_rxtx.c L 1129
>> #2 hn_xmit_pkts at /drivers/net/netvsc/hn_rxtx.c L1553
>>
>> This commit fixes the issue by creating an Rx queue for every Tx queue meaning
>> that an event buffer is allocated to handle receiving Tx completion messages.
>>
>> mbuf pool and Rx ring are not allocated for these additional Rx queues and RSS
>> configuration ensures that no packets are received on them.
>>
>> Fixes: 4e9c73e96e83 ("net/netvsc: add Hyper-V network device")
>> Cc: sthemmin@microsoft.com
>> Cc: stable@dpdk.org
>>
>> Signed-off-by: Alan Elder <alan.elder@microsoft.com>
>> ---
>
> Is there any requirements to the order the application must call rte_eth_rx_queue_setup() and rte_eth_tx_queue_setup()?
>
> I.e. does it work if rte_eth_tx_queue_setup() is called before rte_eth_rx_queue_setup(), and in the opposite order?
>
>
> Although the ethdev documentation says:
>
> "The functions exported by the application Ethernet API to setup a device designated by its port identifier must be invoked in the following order:
>
> rte_eth_dev_configure()
> rte_eth_tx_queue_setup()
> rte_eth_rx_queue_setup()
> rte_eth_dev_start()",
>
> I would assume the order of calling rte_eth_tx_queue_setup() and rte_eth_rx_queue_setup() doesn't matter.
>
Same, I am not aware of any strict ordering requirement for Rx and Tx
queues setup.
>
> And the rte_eth_dev_reset() function documentation has rx/tx queue setup in the opposite order:
>
> "After calling rte_eth_dev_reset(), the application should use rte_eth_dev_configure(), rte_eth_rx_queue_setup(), rte_eth_tx_queue_setup(), and rte_eth_dev_start() to reconfigure the device as appropriate."
>
@@ -313,6 +313,15 @@ static int hn_rss_reta_update(struct rte_eth_dev *dev,
if (reta_conf[idx].mask & mask)
hv->rss_ind[i] = reta_conf[idx].reta[shift];
+
+ /*
+ * Ensure we don't allow config that directs traffic to an Rx
+ * queue that we aren't going to poll
+ */
+ if (hv->rss_ind[i] >= dev->data->nb_rx_queues) {
+ PMD_DRV_LOG(ERR, "RSS distributing traffic to invalid Rx queue");
+ return -EINVAL;
+ }
}
err = hn_rndis_conf_rss(hv, NDIS_RSS_FLAG_DISABLE);
diff --git a/drivers/net/netvsc/hn_rxtx.c b/drivers/net/netvsc/hn_rxtx.c
index 9bf1ec5509..e23880c176 100644
@@ -234,6 +234,17 @@ static void hn_reset_txagg(struct hn_tx_queue *txq)
txq->agg_prevpkt = NULL;
}
+static void
+hn_rx_queue_free_common(struct hn_rx_queue *rxq) {
+ if (!rxq)
+ return;
+
+ rte_free(rxq->rxbuf_info);
+ rte_free(rxq->event_buf);
+ rte_free(rxq);
+}
+
int
hn_dev_tx_queue_setup(struct rte_eth_dev *dev,
uint16_t queue_idx, uint16_t nb_desc,
@@ -243,6 +254,7 @@ hn_dev_tx_queue_setup(struct rte_eth_dev *dev,
{
struct hn_data *hv = dev->data->dev_private;