[1/2] vhost: fix memory leak in Virtio Tx split path

Message ID 20240131093113.2208894-1-maxime.coquelin@redhat.com (mailing list archive)
State Superseded, archived
Delegated to: Maxime Coquelin
Headers
Series [1/2] vhost: fix memory leak in Virtio Tx split path |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/iol-testing warning apply patch failure

Commit Message

Maxime Coquelin Jan. 31, 2024, 9:31 a.m. UTC
  When vIOMMU is enabled and Virtio device is bound to kernel
driver in guest, rte_vhost_dequeue_burst() will often return
early because of IOTLB misses.

This patch fixes a mbuf leak occurring in this case.

Fixes: 242695f6122a ("vhost: allocate and free packets in bulk in Tx split")
Cc: stable@dpdk.org

Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
---
 lib/vhost/virtio_net.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)
  

Comments

David Marchand Jan. 31, 2024, 1:19 p.m. UTC | #1
On Wed, Jan 31, 2024 at 10:31 AM Maxime Coquelin
<maxime.coquelin@redhat.com> wrote:
>
> When vIOMMU is enabled and Virtio device is bound to kernel
> driver in guest, rte_vhost_dequeue_burst() will often return
> early because of IOTLB misses.
>
> This patch fixes a mbuf leak occurring in this case.
>
> Fixes: 242695f6122a ("vhost: allocate and free packets in bulk in Tx split")
> Cc: stable@dpdk.org
>
> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> ---
>  lib/vhost/virtio_net.c | 20 ++++++++------------
>  1 file changed, 8 insertions(+), 12 deletions(-)
>
> diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
> index 280d4845f8..db9985c9b9 100644
> --- a/lib/vhost/virtio_net.c
> +++ b/lib/vhost/virtio_net.c
> @@ -3120,11 +3120,8 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
>                                                 VHOST_ACCESS_RO) < 0))
>                         break;
>
> -               c(vq, head_idx, 0);
> -
>                 if (unlikely(buf_len <= dev->vhost_hlen)) {
> -                       dropped += 1;
> -                       i++;
> +                       dropped = 1;
>                         break;
>                 }

``i`` was used for both filling the returned mbuf array, but also to
update the shadow_used_idx / array.

So this change here also affects how the currently considered
descriptor is returned through the used ring.

See below...

>
> @@ -3143,8 +3140,7 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
>                                         buf_len, mbuf_pool->name);
>                                 allocerr_warned = true;
>                         }
> -                       dropped += 1;
> -                       i++;
> +                       dropped = 1;
>                         break;
>                 }
>
> @@ -3155,17 +3151,17 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
>                                 VHOST_DATA_LOG(dev->ifname, ERR, "failed to copy desc to mbuf.");
>                                 allocerr_warned = true;
>                         }
> -                       dropped += 1;
> -                       i++;
> +                       dropped = 1;
>                         break;
>                 }
>
> +               update_shadow_used_ring_split(vq, head_idx, 0);
>         }
>
> -       if (dropped)
> -               rte_pktmbuf_free_bulk(&pkts[i - 1], count - i + 1);
> +       if (unlikely(count != i))
> +               rte_pktmbuf_free_bulk(&pkts[i], count - i);
>
> -       vq->last_avail_idx += i;
> +       vq->last_avail_idx += i + dropped;
>
>         do_data_copy_dequeue(vq);
>         if (unlikely(i < count))

... I am copying the rest of the context:
       if (unlikely(i < count))
               vq->shadow_used_idx = i;

Before the patch, when breaking and doing the i++ stuff,
vq->shadow_used_idx was probably already equal to i because
update_shadow_used_ring_split had been called earlier.
Because of this, the "dropped" descriptor was part of the shadow_used
array for returning it through the used ring.

With the patch, since we break without touching i, it means that the
"dropped" descriptor is not returned anymore.

Fixing this issue could take the form of restoring the call to
update_shadow_used_ring_split in the loop and adjust
vq->shadow_used_idx to i + dropped.

But I think we can go one step further, by noting that
vq->last_avail_idx is being incremented by the same "i + dropped"
value.
Meaning that we could simply rely on calls to
update_shadow_used_ring_split and reuse vq->shadow_used_idx to adjust
vq->last_avail_idx.
By doing this, there is no need for a "dropped" variable, and no need
for touching of vq->shadow_used_idx manually, which is probably better
for robustness / easing readability.


Note: we could also move the call do_data_copy_dequeue() under the
check on vq->shadow_used_idx != 0, though it won't probably change
anything.


This gives the following (untested) diff, on top of your fix:
diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index 211d24b36a..4e1d61bd54 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -3104,7 +3104,6 @@ virtio_dev_tx_split(struct virtio_net *dev,
struct vhost_virtqueue *vq,
 {
        uint16_t i;
        uint16_t avail_entries;
-       uint16_t dropped = 0;
        static bool allocerr_warned;

        /*
@@ -3141,10 +3140,10 @@ virtio_dev_tx_split(struct virtio_net *dev,
struct vhost_virtqueue *vq,
                                                VHOST_ACCESS_RO) < 0))
                        break;

-               if (unlikely(buf_len <= dev->vhost_hlen)) {
-                       dropped = 1;
+               update_shadow_used_ring_split(vq, head_idx, 0);
+
+               if (unlikely(buf_len <= dev->vhost_hlen))
                        break;
-               }

                buf_len -= dev->vhost_hlen;

@@ -3161,7 +3160,6 @@ virtio_dev_tx_split(struct virtio_net *dev,
struct vhost_virtqueue *vq,
                                        buf_len, mbuf_pool->name);
                                allocerr_warned = true;
                        }
-                       dropped = 1;
                        break;
                }

@@ -3172,22 +3170,16 @@ virtio_dev_tx_split(struct virtio_net *dev,
struct vhost_virtqueue *vq,
                                VHOST_DATA_LOG(dev->ifname, ERR,
"failed to copy desc to mbuf.");
                                allocerr_warned = true;
                        }
-                       dropped = 1;
                        break;
                }
-
-               update_shadow_used_ring_split(vq, head_idx, 0);
        }

        if (unlikely(count != i))
                rte_pktmbuf_free_bulk(&pkts[i], count - i);

-       vq->last_avail_idx += i + dropped;
-
        do_data_copy_dequeue(vq);
-       if (unlikely(i < count))
-               vq->shadow_used_idx = i;
        if (likely(vq->shadow_used_idx)) {
+               vq->last_avail_idx += vq->shadow_used_idx;
                flush_shadow_used_ring_split(dev, vq);
                vhost_vring_call_split(dev, vq);
        }
  
Maxime Coquelin Jan. 31, 2024, 1:32 p.m. UTC | #2
Hi David,

On 1/31/24 14:19, David Marchand wrote:
> On Wed, Jan 31, 2024 at 10:31 AM Maxime Coquelin
> <maxime.coquelin@redhat.com> wrote:
>>
>> When vIOMMU is enabled and Virtio device is bound to kernel
>> driver in guest, rte_vhost_dequeue_burst() will often return
>> early because of IOTLB misses.
>>
>> This patch fixes a mbuf leak occurring in this case.
>>
>> Fixes: 242695f6122a ("vhost: allocate and free packets in bulk in Tx split")
>> Cc: stable@dpdk.org
>>
>> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
>> ---
>>   lib/vhost/virtio_net.c | 20 ++++++++------------
>>   1 file changed, 8 insertions(+), 12 deletions(-)
>>
>> diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
>> index 280d4845f8..db9985c9b9 100644
>> --- a/lib/vhost/virtio_net.c
>> +++ b/lib/vhost/virtio_net.c
>> @@ -3120,11 +3120,8 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>                                                  VHOST_ACCESS_RO) < 0))
>>                          break;
>>
>> -               c(vq, head_idx, 0);
>> -
>>                  if (unlikely(buf_len <= dev->vhost_hlen)) {
>> -                       dropped += 1;
>> -                       i++;
>> +                       dropped = 1;
>>                          break;
>>                  }
> 
> ``i`` was used for both filling the returned mbuf array, but also to
> update the shadow_used_idx / array.
> 
> So this change here also affects how the currently considered
> descriptor is returned through the used ring.
> 
> See below...
> 
>>
>> @@ -3143,8 +3140,7 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>                                          buf_len, mbuf_pool->name);
>>                                  allocerr_warned = true;
>>                          }
>> -                       dropped += 1;
>> -                       i++;
>> +                       dropped = 1;
>>                          break;
>>                  }
>>
>> @@ -3155,17 +3151,17 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>                                  VHOST_DATA_LOG(dev->ifname, ERR, "failed to copy desc to mbuf.");
>>                                  allocerr_warned = true;
>>                          }
>> -                       dropped += 1;
>> -                       i++;
>> +                       dropped = 1;
>>                          break;
>>                  }
>>
>> +               update_shadow_used_ring_split(vq, head_idx, 0);
>>          }
>>
>> -       if (dropped)
>> -               rte_pktmbuf_free_bulk(&pkts[i - 1], count - i + 1);
>> +       if (unlikely(count != i))
>> +               rte_pktmbuf_free_bulk(&pkts[i], count - i);
>>
>> -       vq->last_avail_idx += i;
>> +       vq->last_avail_idx += i + dropped;
>>
>>          do_data_copy_dequeue(vq);
>>          if (unlikely(i < count))
> 
> ... I am copying the rest of the context:
>         if (unlikely(i < count))
>                 vq->shadow_used_idx = i;
> 
> Before the patch, when breaking and doing the i++ stuff,
> vq->shadow_used_idx was probably already equal to i because
> update_shadow_used_ring_split had been called earlier.
> Because of this, the "dropped" descriptor was part of the shadow_used
> array for returning it through the used ring.
> 
> With the patch, since we break without touching i, it means that the
> "dropped" descriptor is not returned anymore.
> 
> Fixing this issue could take the form of restoring the call to
> update_shadow_used_ring_split in the loop and adjust
> vq->shadow_used_idx to i + dropped.
> 
> But I think we can go one step further, by noting that
> vq->last_avail_idx is being incremented by the same "i + dropped"
> value.
> Meaning that we could simply rely on calls to
> update_shadow_used_ring_split and reuse vq->shadow_used_idx to adjust
> vq->last_avail_idx.
> By doing this, there is no need for a "dropped" variable, and no need
> for touching of vq->shadow_used_idx manually, which is probably better
> for robustness / easing readability.

I fully agree with your suggestion as discussed off-list.
On top of that, it also makes the split code closer to the packed one.

> 
> Note: we could also move the call do_data_copy_dequeue() under the
> check on vq->shadow_used_idx != 0, though it won't probably change
> anything.

I will move the call to do_data_copy_dequeue() under vq->shadow_used_idx
!= 0 check.

Thanks,
Maxime

> 
> This gives the following (untested) diff, on top of your fix:
> diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
> index 211d24b36a..4e1d61bd54 100644
> --- a/lib/vhost/virtio_net.c
> +++ b/lib/vhost/virtio_net.c
> @@ -3104,7 +3104,6 @@ virtio_dev_tx_split(struct virtio_net *dev,
> struct vhost_virtqueue *vq,
>   {
>          uint16_t i;
>          uint16_t avail_entries;
> -       uint16_t dropped = 0;
>          static bool allocerr_warned;
> 
>          /*
> @@ -3141,10 +3140,10 @@ virtio_dev_tx_split(struct virtio_net *dev,
> struct vhost_virtqueue *vq,
>                                                  VHOST_ACCESS_RO) < 0))
>                          break;
> 
> -               if (unlikely(buf_len <= dev->vhost_hlen)) {
> -                       dropped = 1;
> +               update_shadow_used_ring_split(vq, head_idx, 0);
> +
> +               if (unlikely(buf_len <= dev->vhost_hlen))
>                          break;
> -               }
> 
>                  buf_len -= dev->vhost_hlen;
> 
> @@ -3161,7 +3160,6 @@ virtio_dev_tx_split(struct virtio_net *dev,
> struct vhost_virtqueue *vq,
>                                          buf_len, mbuf_pool->name);
>                                  allocerr_warned = true;
>                          }
> -                       dropped = 1;
>                          break;
>                  }
> 
> @@ -3172,22 +3170,16 @@ virtio_dev_tx_split(struct virtio_net *dev,
> struct vhost_virtqueue *vq,
>                                  VHOST_DATA_LOG(dev->ifname, ERR,
> "failed to copy desc to mbuf.");
>                                  allocerr_warned = true;
>                          }
> -                       dropped = 1;
>                          break;
>                  }
> -
> -               update_shadow_used_ring_split(vq, head_idx, 0);
>          }
> 
>          if (unlikely(count != i))
>                  rte_pktmbuf_free_bulk(&pkts[i], count - i);
> 
> -       vq->last_avail_idx += i + dropped;
> -
>          do_data_copy_dequeue(vq);
> -       if (unlikely(i < count))
> -               vq->shadow_used_idx = i;
>          if (likely(vq->shadow_used_idx)) {
> +               vq->last_avail_idx += vq->shadow_used_idx;
>                  flush_shadow_used_ring_split(dev, vq);
>                  vhost_vring_call_split(dev, vq);
>          }
> 
>
  

Patch

diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index 280d4845f8..db9985c9b9 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -3120,11 +3120,8 @@  virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 						VHOST_ACCESS_RO) < 0))
 			break;
 
-		update_shadow_used_ring_split(vq, head_idx, 0);
-
 		if (unlikely(buf_len <= dev->vhost_hlen)) {
-			dropped += 1;
-			i++;
+			dropped = 1;
 			break;
 		}
 
@@ -3143,8 +3140,7 @@  virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 					buf_len, mbuf_pool->name);
 				allocerr_warned = true;
 			}
-			dropped += 1;
-			i++;
+			dropped = 1;
 			break;
 		}
 
@@ -3155,17 +3151,17 @@  virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 				VHOST_DATA_LOG(dev->ifname, ERR, "failed to copy desc to mbuf.");
 				allocerr_warned = true;
 			}
-			dropped += 1;
-			i++;
+			dropped = 1;
 			break;
 		}
 
+		update_shadow_used_ring_split(vq, head_idx, 0);
 	}
 
-	if (dropped)
-		rte_pktmbuf_free_bulk(&pkts[i - 1], count - i + 1);
+	if (unlikely(count != i))
+		rte_pktmbuf_free_bulk(&pkts[i], count - i);
 
-	vq->last_avail_idx += i;
+	vq->last_avail_idx += i + dropped;
 
 	do_data_copy_dequeue(vq);
 	if (unlikely(i < count))
@@ -3175,7 +3171,7 @@  virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		vhost_vring_call_split(dev, vq);
 	}
 
-	return (i - dropped);
+	return i;
 }
 
 __rte_noinline