vhost: add device op to offload the interrupt kick

Message ID 167992139724.45323.17979512439014217881.stgit@ebuild.local (mailing list archive)
State Superseded, archived
Delegated to: Maxime Coquelin
Headers
Series vhost: add device op to offload the interrupt kick |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/loongarch-compilation success Compilation OK
ci/loongarch-unit-testing success Unit Testing PASS
ci/Intel-compilation success Compilation OK
ci/intel-Testing success Testing PASS
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-broadcom-Functional success Functional Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-abi-testing warning Testing issues
ci/iol-unit-testing success Testing PASS
ci/iol-aarch64-unit-testing success Testing PASS
ci/iol-testing success Testing PASS
ci/iol-x86_64-unit-testing success Testing PASS
ci/iol-x86_64-compile-testing success Testing PASS
ci/iol-aarch64-compile-testing success Testing PASS
ci/intel-Functional success Functional PASS

Commit Message

Eelco Chaudron March 27, 2023, 12:51 p.m. UTC
  This patch adds an operation callback which gets called every time the
library wants to call eventfd_write(). This eventfd_write() call could
result in a system call, which could potentially block the PMD thread.

The callback function can decide whether it's ok to handle the
eventfd_write() now or have the newly introduced function,
rte_vhost_notify_guest(), called at a later time.

This can be used by 3rd party applications, like OVS, to avoid system
calls being called as part of the PMD threads.

Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
---
 lib/vhost/rte_vhost.h |   10 +++++++++-
 lib/vhost/vhost.c     |   21 +++++++++++++++++++++
 lib/vhost/vhost.h     |   43 ++++++++++++++++++++++++++++---------------
 3 files changed, 58 insertions(+), 16 deletions(-)
  

Comments

Maxime Coquelin March 27, 2023, 1:21 p.m. UTC | #1
Hi Eelco,

On 3/27/23 14:51, Eelco Chaudron wrote:
> This patch adds an operation callback which gets called every time the
> library wants to call eventfd_write(). This eventfd_write() call could
> result in a system call, which could potentially block the PMD thread.
> 
> The callback function can decide whether it's ok to handle the
> eventfd_write() now or have the newly introduced function,
> rte_vhost_notify_guest(), called at a later time.
> 
> This can be used by 3rd party applications, like OVS, to avoid system
> calls being called as part of the PMD threads.

That's a good idea, please find some comments inline:

> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
> ---
>   lib/vhost/rte_vhost.h |   10 +++++++++-
>   lib/vhost/vhost.c     |   21 +++++++++++++++++++++
>   lib/vhost/vhost.h     |   43 ++++++++++++++++++++++++++++---------------
>   3 files changed, 58 insertions(+), 16 deletions(-)
> 
> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
> index 58a5d4be92..af7a394d0f 100644
> --- a/lib/vhost/rte_vhost.h
> +++ b/lib/vhost/rte_vhost.h
> @@ -298,7 +298,13 @@ struct rte_vhost_device_ops {
>   	 */
>   	void (*guest_notified)(int vid);
>   
> -	void *reserved[1]; /**< Reserved for future extension */
> +	/**
> +	 * If this callback is registered, notification to the guest can
> +	 * be handled by the front-end calling rte_vhost_notify_guest().
> +	 * If it's not handled, 'false' should be returned. This can be used
> +	 * to remove the "slow" eventfd_write() syscall from the datapath.
> +	 */
> +	bool (*guest_notify)(int vid, uint16_t queue_id);
>   };
>   
>   /**
> @@ -433,6 +439,8 @@ void rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
>   
>   int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable);
>   
> +void rte_vhost_notify_guest(int vid, uint16_t queue_id);

The new API needs to be tagged as experimental, and also documented. (I
see rte_vhost_enable_guest_notification is not properly documented, so
not a good example!)

> +
>   /**
>    * Register vhost driver. path could be different for multiple
>    * instance support.
> diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c
> index ef37943817..ee090d78ef 100644
> --- a/lib/vhost/vhost.c
> +++ b/lib/vhost/vhost.c
> @@ -1467,6 +1467,27 @@ rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable)
>   	return ret;
>   }
>   
> +void
> +rte_vhost_notify_guest(int vid, uint16_t queue_id)
> +{
> +	struct virtio_net *dev = get_device(vid);
> +	struct vhost_virtqueue *vq;
> +
> +	if (!dev ||  queue_id >= VHOST_MAX_VRING)
> +		return;
> +
> +	vq = dev->virtqueue[queue_id];
> +	if (!vq)
> +		return;
> +
> +	rte_spinlock_lock(&vq->access_lock);
> +
> +	if (vq->callfd >= 0)
> +		eventfd_write(vq->callfd, (eventfd_t)1);

Maybe we should return an error if callfd is invalid or eventfd_write()
fails.

> +
> +	rte_spinlock_unlock(&vq->access_lock);
> +}
> +
>   void
>   rte_vhost_log_write(int vid, uint64_t addr, uint64_t len)
>   {
> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
> index 8fdab13c70..39ad8260a1 100644
> --- a/lib/vhost/vhost.h
> +++ b/lib/vhost/vhost.h
> @@ -883,6 +883,30 @@ vhost_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
>   	return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
>   }
>   
> +static __rte_always_inline void
> +vhost_vring_kick_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
> +{
> +	if (dev->notify_ops->guest_notify) {
> +		uint16_t qid;
> +		for (qid = 0;  qid < dev->nr_vring; qid++) {
> +			if (dev->virtqueue[qid] == vq) {
> +				if (dev->notify_ops->guest_notify(dev->vid,
> +								  qid))
> +					goto done;
> +				break;
> +			}
> +		}

Since v22.11, you no longer need to iterate through the port's virtqueues;
David introduced an index field in struct vhost_virtqueue in v22.11 (57e414e3ec29
("vhost: keep a reference to virtqueue index")).


> +	}
> +	eventfd_write(vq->callfd, (eventfd_t) 1);
> +
> +done:
> +	if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
> +		vq->stats.guest_notifications++;
> +	if (dev->notify_ops->guest_notified)
> +		dev->notify_ops->guest_notified(dev->vid);
> +}

FYI, I have done almost the same refactoring in the VDUSE series I will
soon send:

https://gitlab.com/mcoquelin/dpdk-next-virtio/-/commit/7089a8a6d1e89b9db90547eb5b4fef886f24fd5b

My version also introduces a new counter in case the notification
failed. Maybe we could also have a counter in case the notification has
been "offloaded" to the application?

> +
> +
>   static __rte_always_inline void
>   vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
>   {
> @@ -905,21 +929,13 @@ vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
>   		if ((vhost_need_event(vhost_used_event(vq), new, old) &&
>   					(vq->callfd >= 0)) ||
>   				unlikely(!signalled_used_valid)) {
> -			eventfd_write(vq->callfd, (eventfd_t) 1);
> -			if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
> -				vq->stats.guest_notifications++;
> -			if (dev->notify_ops->guest_notified)
> -				dev->notify_ops->guest_notified(dev->vid);
> +			vhost_vring_kick_guest(dev, vq);
>   		}
>   	} else {
>   		/* Kick the guest if necessary. */
>   		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
>   				&& (vq->callfd >= 0)) {
> -			eventfd_write(vq->callfd, (eventfd_t)1);
> -			if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
> -				vq->stats.guest_notifications++;
> -			if (dev->notify_ops->guest_notified)
> -				dev->notify_ops->guest_notified(dev->vid);
> +			vhost_vring_kick_guest(dev, vq);
>   		}
>   	}
>   }
> @@ -971,11 +987,8 @@ vhost_vring_call_packed(struct virtio_net *dev, struct vhost_virtqueue *vq)
>   	if (vhost_need_event(off, new, old))
>   		kick = true;
>   kick:
> -	if (kick) {
> -		eventfd_write(vq->callfd, (eventfd_t)1);
> -		if (dev->notify_ops->guest_notified)
> -			dev->notify_ops->guest_notified(dev->vid);
> -	}
> +	if (kick)
> +		vhost_vring_kick_guest(dev, vq);
>   }
>   
>   static __rte_always_inline void
>
  
Eelco Chaudron March 27, 2023, 2:10 p.m. UTC | #2
On 27 Mar 2023, at 15:21, Maxime Coquelin wrote:

> Hi Eelco,
>
> On 3/27/23 14:51, Eelco Chaudron wrote:
>> This patch adds an operation callback which gets called every time the
>> library wants to call eventfd_write(). This eventfd_write() call could
>> result in a system call, which could potentially block the PMD thread.
>>
>> The callback function can decide whether it's ok to handle the
>> eventfd_write() now or have the newly introduced function,
>> rte_vhost_notify_guest(), called at a later time.
>>
>> This can be used by 3rd party applications, like OVS, to avoid system
>> calls being called as part of the PMD threads.
>
> That's a good idea, please find some comments inline:

Thanks for the review, see inline.
>
>> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
>> ---
>>   lib/vhost/rte_vhost.h |   10 +++++++++-
>>   lib/vhost/vhost.c     |   21 +++++++++++++++++++++
>>   lib/vhost/vhost.h     |   43 ++++++++++++++++++++++++++++---------------
>>   3 files changed, 58 insertions(+), 16 deletions(-)
>>
>> diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
>> index 58a5d4be92..af7a394d0f 100644
>> --- a/lib/vhost/rte_vhost.h
>> +++ b/lib/vhost/rte_vhost.h
>> @@ -298,7 +298,13 @@ struct rte_vhost_device_ops {
>>   	 */
>>   	void (*guest_notified)(int vid);
>>  -	void *reserved[1]; /**< Reserved for future extension */
>> +	/**
>> +	 * If this callback is registered, notification to the guest can
>> +	 * be handled by the front-end calling rte_vhost_notify_guest().
>> +	 * If it's not handled, 'false' should be returned. This can be used
>> +	 * to remove the "slow" eventfd_write() syscall from the datapath.
>> +	 */
>> +	bool (*guest_notify)(int vid, uint16_t queue_id);
>>   };
>>    /**
>> @@ -433,6 +439,8 @@ void rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
>>    int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable);
>>  +void rte_vhost_notify_guest(int vid, uint16_t queue_id);
>
> The new API needs to be tagged as experimental, and also documented. (I
> see rte_vhost_enable_guest_notification is not properly documented, so
> not a good example!)

I used exactly that as an example, so thought it should be good ;)
Will add this in the next revision...

>> +
>>   /**
>>    * Register vhost driver. path could be different for multiple
>>    * instance support.
>> diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c
>> index ef37943817..ee090d78ef 100644
>> --- a/lib/vhost/vhost.c
>> +++ b/lib/vhost/vhost.c
>> @@ -1467,6 +1467,27 @@ rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable)
>>   	return ret;
>>   }
>>  +void
>> +rte_vhost_notify_guest(int vid, uint16_t queue_id)
>> +{
>> +	struct virtio_net *dev = get_device(vid);
>> +	struct vhost_virtqueue *vq;
>> +
>> +	if (!dev ||  queue_id >= VHOST_MAX_VRING)
>> +		return;
>> +
>> +	vq = dev->virtqueue[queue_id];
>> +	if (!vq)
>> +		return;
>> +
>> +	rte_spinlock_lock(&vq->access_lock);
>> +
>> +	if (vq->callfd >= 0)
>> +		eventfd_write(vq->callfd, (eventfd_t)1);
>
> Maybe we should return an error if callfd is invalid or eventfd_write()
> fails.

See below

>> +
>> +	rte_spinlock_unlock(&vq->access_lock);
>> +}
>> +
>>   void
>>   rte_vhost_log_write(int vid, uint64_t addr, uint64_t len)
>>   {
>> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
>> index 8fdab13c70..39ad8260a1 100644
>> --- a/lib/vhost/vhost.h
>> +++ b/lib/vhost/vhost.h
>> @@ -883,6 +883,30 @@ vhost_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
>>   	return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
>>   }
>>  +static __rte_always_inline void
>> +vhost_vring_kick_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
>> +{
>> +	if (dev->notify_ops->guest_notify) {
>> +		uint16_t qid;
>> +		for (qid = 0;  qid < dev->nr_vring; qid++) {
>> +			if (dev->virtqueue[qid] == vq) {
>> +				if (dev->notify_ops->guest_notify(dev->vid,
>> +								  qid))
>> +					goto done;
>> +				break;
>> +			}
>> +		}
>
> Since v22.11, you no longer need to iterate through the port's virtqueues;
> David introduced an index field in struct vhost_virtqueue in v22.11 (57e414e3ec29
> ("vhost: keep a reference to virtqueue index")).

Thanks will change this, I did most of this a while back and did not re-check.

>> +	}
>> +	eventfd_write(vq->callfd, (eventfd_t) 1);
>> +
>> +done:
>> +	if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
>> +		vq->stats.guest_notifications++;
>> +	if (dev->notify_ops->guest_notified)
>> +		dev->notify_ops->guest_notified(dev->vid);
>> +}
>
> FYI, I have done almost the same refactoring in the VDUSE series I will
> soon send:
>
> https://gitlab.com/mcoquelin/dpdk-next-virtio/-/commit/7089a8a6d1e89b9db90547eb5b4fef886f24fd5b
>
> My version also introduces a new counter in case the notification
> failed. Maybe we could also have a counter in case the notification has
> been "offloaded" to the application?

What would be the best approach!? I can wait to send a new rev until your patch is out.

>> +
>> +
>>   static __rte_always_inline void
>>   vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
>>   {
>> @@ -905,21 +929,13 @@ vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
>>   		if ((vhost_need_event(vhost_used_event(vq), new, old) &&
>>   					(vq->callfd >= 0)) ||
>>   				unlikely(!signalled_used_valid)) {
>> -			eventfd_write(vq->callfd, (eventfd_t) 1);
>> -			if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
>> -				vq->stats.guest_notifications++;
>> -			if (dev->notify_ops->guest_notified)
>> -				dev->notify_ops->guest_notified(dev->vid);
>> +			vhost_vring_kick_guest(dev, vq);
>>   		}
>>   	} else {
>>   		/* Kick the guest if necessary. */
>>   		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
>>   				&& (vq->callfd >= 0)) {
>> -			eventfd_write(vq->callfd, (eventfd_t)1);
>> -			if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
>> -				vq->stats.guest_notifications++;
>> -			if (dev->notify_ops->guest_notified)
>> -				dev->notify_ops->guest_notified(dev->vid);
>> +			vhost_vring_kick_guest(dev, vq);
>>   		}
>>   	}
>>   }
>> @@ -971,11 +987,8 @@ vhost_vring_call_packed(struct virtio_net *dev, struct vhost_virtqueue *vq)
>>   	if (vhost_need_event(off, new, old))
>>   		kick = true;
>>   kick:
>> -	if (kick) {
>> -		eventfd_write(vq->callfd, (eventfd_t)1);
>> -		if (dev->notify_ops->guest_notified)
>> -			dev->notify_ops->guest_notified(dev->vid);
>> -	}
>> +	if (kick)
>> +		vhost_vring_kick_guest(dev, vq);
>>   }
>>    static __rte_always_inline void
>>
  
Gowrishankar Muthukrishnan March 27, 2023, 3:16 p.m. UTC | #3
Hi Eelco,

> +void
> +rte_vhost_notify_guest(int vid, uint16_t queue_id) {
> +	struct virtio_net *dev = get_device(vid);
> +	struct vhost_virtqueue *vq;
> +
> +	if (!dev ||  queue_id >= VHOST_MAX_VRING)
> +		return;
> +
> +	vq = dev->virtqueue[queue_id];
> +	if (!vq)
> +		return;
> +
> +	rte_spinlock_lock(&vq->access_lock);
> +

Is a spin lock needed here before the system call?

> +	if (vq->callfd >= 0)
> +		eventfd_write(vq->callfd, (eventfd_t)1);
> +
> +	rte_spinlock_unlock(&vq->access_lock);
> +}
> +

Thanks.
  
Eelco Chaudron March 27, 2023, 4:04 p.m. UTC | #4
On 27 Mar 2023, at 17:16, Gowrishankar Muthukrishnan wrote:

> Hi Eelco,
>
>> +void
>> +rte_vhost_notify_guest(int vid, uint16_t queue_id) {
>> +	struct virtio_net *dev = get_device(vid);
>> +	struct vhost_virtqueue *vq;
>> +
>> +	if (!dev ||  queue_id >= VHOST_MAX_VRING)
>> +		return;
>> +
>> +	vq = dev->virtqueue[queue_id];
>> +	if (!vq)
>> +		return;
>> +
>> +	rte_spinlock_lock(&vq->access_lock);
>> +
>
> Is spin lock needed here before system call ?

I assumed access_lock is protecting all the following fields in this structure, so I need the lock to read the vq->callfd, however, I can/should move the eventfd_write outside of the lock.

>> +	if (vq->callfd >= 0)
>> +		eventfd_write(vq->callfd, (eventfd_t)1);
>> +
>> +	rte_spinlock_unlock(&vq->access_lock);
>> +}
>> +
>
> Thanks.
  
Maxime Coquelin March 27, 2023, 4:35 p.m. UTC | #5
On 3/27/23 18:04, Eelco Chaudron wrote:
> 
> 
> On 27 Mar 2023, at 17:16, Gowrishankar Muthukrishnan wrote:
> 
>> Hi Eelco,
>>
>>> +void
>>> +rte_vhost_notify_guest(int vid, uint16_t queue_id) {
>>> +	struct virtio_net *dev = get_device(vid);
>>> +	struct vhost_virtqueue *vq;
>>> +
>>> +	if (!dev ||  queue_id >= VHOST_MAX_VRING)
>>> +		return;
>>> +
>>> +	vq = dev->virtqueue[queue_id];
>>> +	if (!vq)
>>> +		return;
>>> +
>>> +	rte_spinlock_lock(&vq->access_lock);
>>> +
>>
>> Is spin lock needed here before system call ?
> 
> I assumed access_lock is protecting all the following fields in this structure, so I need the lock to read the vq->callfd, however, I can/should move the eventfd_write outside of the lock.

The FD might be closed between the check and the call to eventfd_write
though, but I agree this is not optimal to call the eventfd_write under
the spinlock in your case, as you will block the pmd thread if it tries
to enqueue/dequeue packets on this queue, defeating the purpose of this
patch.

Maybe the solution is to change to read-write locks for the access_lock
spinlock. The datapath (rte_vhost_enqueue_burst/rte_vhost_dequeue_burst)
and this API would use the read version, meaning they won't lock each
other, and the control path (lib/vhost/vhost_user.c) will use the write
version.

Does that make sense?

Maxime
> 
>>> +	if (vq->callfd >= 0)
>>> +		eventfd_write(vq->callfd, (eventfd_t)1);
>>> +
>>> +	rte_spinlock_unlock(&vq->access_lock);
>>> +}
>>> +
>>
>> Thanks.
>
  
Eelco Chaudron March 28, 2023, 12:14 p.m. UTC | #6
On 27 Mar 2023, at 18:35, Maxime Coquelin wrote:

> On 3/27/23 18:04, Eelco Chaudron wrote:
>>
>>
>> On 27 Mar 2023, at 17:16, Gowrishankar Muthukrishnan wrote:
>>
>>> Hi Eelco,
>>>
>>>> +void
>>>> +rte_vhost_notify_guest(int vid, uint16_t queue_id) {
>>>> +	struct virtio_net *dev = get_device(vid);
>>>> +	struct vhost_virtqueue *vq;
>>>> +
>>>> +	if (!dev ||  queue_id >= VHOST_MAX_VRING)
>>>> +		return;
>>>> +
>>>> +	vq = dev->virtqueue[queue_id];
>>>> +	if (!vq)
>>>> +		return;
>>>> +
>>>> +	rte_spinlock_lock(&vq->access_lock);
>>>> +
>>>
>>> Is spin lock needed here before system call ?
>>
>> I assumed access_lock is protecting all the following fields in this structure, so I need the lock to read the vq->callfd, however, I can/should move the eventfd_write outside of the lock.
>
> The FD might be closed between the check and the call to eventfd_write
> though, but I agree this is not optimal to call the eventfd_write under
> the spinlock in your case, as you will block the pmd thread if it tries
> to enqueue/dequeue packets on this queue, defeating the purpose of this
> patch.
>
> Maybe the solution is to change to read-write locks for the access_lock
> spinlock. The datapath (rte_vhost_enqueue_burst/rte_vhost_dequeue_burst)
> and this API would use the read version, meaning they won't lock each
> other, and the control path (lib/vhost/vhost_user.c) will use the write
> version.
>
> Does that make sense?

Yes, this makes sense, let me investigate this and get back.

>>>> +	if (vq->callfd >= 0)
>>>> +		eventfd_write(vq->callfd, (eventfd_t)1);
>>>> +
>>>> +	rte_spinlock_unlock(&vq->access_lock);
>>>> +}
>>>> +
>>>
>>> Thanks.
>>
  
Eelco Chaudron April 3, 2023, 2:51 p.m. UTC | #7
On 27 Mar 2023, at 18:35, Maxime Coquelin wrote:

> On 3/27/23 18:04, Eelco Chaudron wrote:
>>
>>
>> On 27 Mar 2023, at 17:16, Gowrishankar Muthukrishnan wrote:
>>
>>> Hi Eelco,
>>>
>>>> +void
>>>> +rte_vhost_notify_guest(int vid, uint16_t queue_id) {
>>>> +	struct virtio_net *dev = get_device(vid);
>>>> +	struct vhost_virtqueue *vq;
>>>> +
>>>> +	if (!dev ||  queue_id >= VHOST_MAX_VRING)
>>>> +		return;
>>>> +
>>>> +	vq = dev->virtqueue[queue_id];
>>>> +	if (!vq)
>>>> +		return;
>>>> +
>>>> +	rte_spinlock_lock(&vq->access_lock);
>>>> +
>>>
>>> Is spin lock needed here before system call ?
>>
>> I assumed access_lock is protecting all the following fields in this structure, so I need the lock to read the vq->callfd, however, I can/should move the eventfd_write outside of the lock.
>
> The FD might be closed between the check and the call to eventfd_write
> though, but I agree this is not optimal to call the eventfd_write under
> the spinlock in your case, as you will block the pmd thread if it tries
> to enqueue/dequeue packets on this queue, defeating the purpose of this
> patch.
>
> Maybe the solution is to change to read-write locks for the access_lock
> spinlock. The datapath (rte_vhost_enqueue_burst/rte_vhost_dequeue_burst)
> and this API would use the read version, meaning they won't lock each
> other, and the control path (lib/vhost/vhost_user.c) will use the write
> version.
>
> Does that make sense?

Hi Maxime, I prepped a patch, but not the read/write part yet, https://github.com/chaudron/dpdk/commit/d51c93b4ff08b43daed33e3c0fee193a6d039c25#.

I was thinking that even a read/write lock does not work (or maybe we need a combination of taking the read and write lock), as we need to update statistics, which need protection.
For example here you see the split (with just two locks, but the sys call could be called in the read lock):

https://github.com/chaudron/dpdk/blob/d51c93b4ff08b43daed33e3c0fee193a6d039c25/lib/vhost/vhost.c#L1493

The best would be to not hold a lock when calling the system call, but that does not seem safe. I do not see a clear solution, partly because I have some trouble understanding the locking rules around vqs.

Do you have some more insights to share? I can ping you offline and discuss this.

Cheers,

Eelco

>>
>>>> +	if (vq->callfd >= 0)
>>>> +		eventfd_write(vq->callfd, (eventfd_t)1);
>>>> +
>>>> +	rte_spinlock_unlock(&vq->access_lock);
>>>> +}
>>>> +
>>>
>>> Thanks.
>>
  
Maxime Coquelin April 3, 2023, 3:26 p.m. UTC | #8
Hi Eelco,

On 4/3/23 16:51, Eelco Chaudron wrote:
> 
> 
> On 27 Mar 2023, at 18:35, Maxime Coquelin wrote:
> 
>> On 3/27/23 18:04, Eelco Chaudron wrote:
>>>
>>>
>>> On 27 Mar 2023, at 17:16, Gowrishankar Muthukrishnan wrote:
>>>
>>>> Hi Eelco,
>>>>
>>>>> +void
>>>>> +rte_vhost_notify_guest(int vid, uint16_t queue_id) {
>>>>> +	struct virtio_net *dev = get_device(vid);
>>>>> +	struct vhost_virtqueue *vq;
>>>>> +
>>>>> +	if (!dev ||  queue_id >= VHOST_MAX_VRING)
>>>>> +		return;
>>>>> +
>>>>> +	vq = dev->virtqueue[queue_id];
>>>>> +	if (!vq)
>>>>> +		return;
>>>>> +
>>>>> +	rte_spinlock_lock(&vq->access_lock);
>>>>> +
>>>>
>>>> Is spin lock needed here before system call ?
>>>
>>> I assumed access_lock is protecting all the following fields in this structure, so I need the lock to read the vq->callfd, however, I can/should move the eventfd_write outside of the lock.
>>
>> The FD might be closed between the check and the call to eventfd_write
>> though, but I agree this is not optimal to call the eventfd_write under
>> the spinlock in your case, as you will block the pmd thread if it tries
>> to enqueue/dequeue packets on this queue, defeating the purpose of this
>> patch.
>>
>> Maybe the solution is to change to read-write locks for the access_lock
>> spinlock. The datapath (rte_vhost_enqueue_burst/rte_vhost_dequeue_burst)
>> and this API would use the read version, meaning they won't lock each
>> other, and the control path (lib/vhost/vhost_user.c) will use the write
>> version.
>>
>> Does that make sense?
> 
> Hi Maxime, I prepped a patch, but not the read/write part yet, https://github.com/chaudron/dpdk/commit/d51c93b4ff08b43daed33e3c0fee193a6d039c25#.
> 
> I was thinking that even a read/write lock does not work (or maybe we need a combination of taking the read and write lock). As we need to update statistics, which need protection.
> For example here you see the split (with just two locks, but the sys call could be called in the read lock):
> 
> https://github.com/chaudron/dpdk/blob/d51c93b4ff08b43daed33e3c0fee193a6d039c25/lib/vhost/vhost.c#L1493
> 
> The best would be to not have a lock when calling the system call, but that does not seem safe. I do not see a clear solution, guess also as I have some trouble understanding the lock rules around vq’s.

The lock is indeed required. Maybe we can use read-lock, and use atomic
operations for counters that could be accessed by several threads?

> 
> Do you have some more insights to share? I can ping you offline and discuss this.

Sure, I'll be happy to discuss more about this.

Thanks,
Maxime

> Cheers,
> 
> Eelco
> 
>>>
>>>>> +	if (vq->callfd >= 0)
>>>>> +		eventfd_write(vq->callfd, (eventfd_t)1);
>>>>> +
>>>>> +	rte_spinlock_unlock(&vq->access_lock);
>>>>> +}
>>>>> +
>>>>
>>>> Thanks.
>>>
>
  

Patch

diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
index 58a5d4be92..af7a394d0f 100644
--- a/lib/vhost/rte_vhost.h
+++ b/lib/vhost/rte_vhost.h
@@ -298,7 +298,13 @@  struct rte_vhost_device_ops {
 	 */
 	void (*guest_notified)(int vid);
 
-	void *reserved[1]; /**< Reserved for future extension */
+	/**
+	 * If this callback is registered, notification to the guest can
+	 * be handled by the front-end calling rte_vhost_notify_guest().
+	 * If it's not handled, 'false' should be returned. This can be used
+	 * to remove the "slow" eventfd_write() syscall from the datapath.
+	 */
+	bool (*guest_notify)(int vid, uint16_t queue_id);
 };
 
 /**
@@ -433,6 +439,8 @@  void rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
 
 int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable);
 
+void rte_vhost_notify_guest(int vid, uint16_t queue_id);
+
 /**
  * Register vhost driver. path could be different for multiple
  * instance support.
diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c
index ef37943817..ee090d78ef 100644
--- a/lib/vhost/vhost.c
+++ b/lib/vhost/vhost.c
@@ -1467,6 +1467,27 @@  rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable)
 	return ret;
 }
 
+void
+rte_vhost_notify_guest(int vid, uint16_t queue_id)
+{
+	struct virtio_net *dev = get_device(vid);
+	struct vhost_virtqueue *vq;
+
+	if (!dev ||  queue_id >= VHOST_MAX_VRING)
+		return;
+
+	vq = dev->virtqueue[queue_id];
+	if (!vq)
+		return;
+
+	rte_spinlock_lock(&vq->access_lock);
+
+	if (vq->callfd >= 0)
+		eventfd_write(vq->callfd, (eventfd_t)1);
+
+	rte_spinlock_unlock(&vq->access_lock);
+}
+
 void
 rte_vhost_log_write(int vid, uint64_t addr, uint64_t len)
 {
diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index 8fdab13c70..39ad8260a1 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -883,6 +883,30 @@  vhost_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
 	return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
 }
 
+static __rte_always_inline void
+vhost_vring_kick_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+	if (dev->notify_ops->guest_notify) {
+		uint16_t qid;
+		for (qid = 0;  qid < dev->nr_vring; qid++) {
+			if (dev->virtqueue[qid] == vq) {
+				if (dev->notify_ops->guest_notify(dev->vid,
+								  qid))
+					goto done;
+				break;
+			}
+		}
+	}
+	eventfd_write(vq->callfd, (eventfd_t) 1);
+
+done:
+	if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
+		vq->stats.guest_notifications++;
+	if (dev->notify_ops->guest_notified)
+		dev->notify_ops->guest_notified(dev->vid);
+}
+
+
 static __rte_always_inline void
 vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
 {
@@ -905,21 +929,13 @@  vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
 		if ((vhost_need_event(vhost_used_event(vq), new, old) &&
 					(vq->callfd >= 0)) ||
 				unlikely(!signalled_used_valid)) {
-			eventfd_write(vq->callfd, (eventfd_t) 1);
-			if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
-				vq->stats.guest_notifications++;
-			if (dev->notify_ops->guest_notified)
-				dev->notify_ops->guest_notified(dev->vid);
+			vhost_vring_kick_guest(dev, vq);
 		}
 	} else {
 		/* Kick the guest if necessary. */
 		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
 				&& (vq->callfd >= 0)) {
-			eventfd_write(vq->callfd, (eventfd_t)1);
-			if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
-				vq->stats.guest_notifications++;
-			if (dev->notify_ops->guest_notified)
-				dev->notify_ops->guest_notified(dev->vid);
+			vhost_vring_kick_guest(dev, vq);
 		}
 	}
 }
@@ -971,11 +987,8 @@  vhost_vring_call_packed(struct virtio_net *dev, struct vhost_virtqueue *vq)
 	if (vhost_need_event(off, new, old))
 		kick = true;
 kick:
-	if (kick) {
-		eventfd_write(vq->callfd, (eventfd_t)1);
-		if (dev->notify_ops->guest_notified)
-			dev->notify_ops->guest_notified(dev->vid);
-	}
+	if (kick)
+		vhost_vring_kick_guest(dev, vq);
 }
 
 static __rte_always_inline void