[dpdk-dev,2/4] vhost: introduce vhost_log_write
Commit Message
Introduce vhost_log_write() helper function to log the dirty pages we
touched. The page size is hard coded to 4096 (VHOST_LOG_PAGE), and each
dirty page is represented by 1 bit in the log.

Therefore, vhost_log_write() simply finds the right bit for the page we
are going to change, and sets it to 1. dev->log_base denotes the start
of the dirty page bitmap.

The address being logged is offset by log_guest_addr, which is derived
from the SET_VRING_ADDR request as part of the vring related addresses.
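
For illustration, marking the page that covers a given guest physical
address dirty boils down to the following bit arithmetic (sketch only;
gpa and log_base stand for the address being logged and the bitmap base):

    uint64_t page = gpa / VHOST_LOG_PAGE;    /* index of the 4K page */
    log_base[page / 8] |= 1 << (page % 8);   /* one dirty bit per page */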
Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
---
lib/librte_vhost/rte_virtio_net.h | 34 ++++++++++++++++++++++++++++++++++
lib/librte_vhost/virtio-net.c | 4 ++++
2 files changed, 38 insertions(+)
Comments
On Wed, Dec 02, 2015 at 11:43:11AM +0800, Yuanhan Liu wrote:
> [...]
>
> diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h
> index 416dac2..191c1be 100644
> --- a/lib/librte_vhost/rte_virtio_net.h
> +++ b/lib/librte_vhost/rte_virtio_net.h
> @@ -40,6 +40,7 @@
> */
>
> #include <stdint.h>
> +#include <linux/vhost.h>
> #include <linux/virtio_ring.h>
> #include <linux/virtio_net.h>
> #include <sys/eventfd.h>
> @@ -59,6 +60,8 @@ struct rte_mbuf;
> /* Backend value set by guest. */
> #define VIRTIO_DEV_STOPPED -1
>
> +#define VHOST_LOG_PAGE 4096
> +
>
> /* Enum for virtqueue management. */
> enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
> @@ -82,6 +85,7 @@ struct vhost_virtqueue {
> struct vring_desc *desc; /**< Virtqueue descriptor ring. */
> struct vring_avail *avail; /**< Virtqueue available ring. */
> struct vring_used *used; /**< Virtqueue used ring. */
> + uint64_t log_guest_addr; /**< Physical address of used ring, for logging */
> uint32_t size; /**< Size of descriptor ring. */
> uint32_t backend; /**< Backend value to determine if device should started/stopped. */
> uint16_t vhost_hlen; /**< Vhost header length (varies depending on RX merge buffers. */
> @@ -203,6 +207,36 @@ gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa)
> return vhost_va;
> }
>
> +static inline void __attribute__((always_inline))
> +vhost_log_page(uint8_t *log_base, uint64_t page)
> +{
> + /* TODO: to make it atomic? */
> + log_base[page / 8] |= 1 << (page % 8);
I think the atomic OR operation is necessary only if there can be
more than one vhost-user back-end updating the guest's memory
simultaneously. However, it is probably safe to perform a regular OR
operation, since rings are not shared between back-ends. What about
buffers pointed to by descriptors? To be on the safe side, I would use
the GCC built-in function __sync_fetch_and_or().
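
For illustration, the helper with that built-in would look roughly like
this (untested sketch):

    static inline void __attribute__((always_inline))
    vhost_log_page(uint8_t *log_base, uint64_t page)
    {
            /* atomically set the dirty bit for this page */
            __sync_fetch_and_or(&log_base[page / 8], 1 << (page % 8));
    }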
> +}
> +
> +static inline void __attribute__((always_inline))
> +vhost_log_write(struct virtio_net *dev, struct vhost_virtqueue *vq,
> + uint64_t offset, uint64_t len)
> +{
> + uint64_t addr = vq->log_guest_addr;
> + uint64_t page;
> +
> + if (unlikely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
> + !dev->log_base || !len))
> + return;
Isn't "likely" more appropriate above, since the whole
expression is expected to be true most of the time?
> +
> + addr += offset;
> + if (dev->log_size < ((addr + len - 1) / VHOST_LOG_PAGE / 8))
> + return;
> +
> + page = addr / VHOST_LOG_PAGE;
> + while (page * VHOST_LOG_PAGE < addr + len) {
> + vhost_log_page(dev->log_base, page);
> + page += VHOST_LOG_PAGE;
> + }
> +}
> +
> +
> /**
> * Disable features in feature_mask. Returns 0 on success.
> */
> diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
> index 8364938..4481827 100644
> --- a/lib/librte_vhost/virtio-net.c
> +++ b/lib/librte_vhost/virtio-net.c
> @@ -666,12 +666,16 @@ set_vring_addr(struct vhost_device_ctx ctx, struct vhost_vring_addr *addr)
> return -1;
> }
>
> + vq->log_guest_addr = addr->log_guest_addr;
> +
> LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") mapped address desc: %p\n",
> dev->device_fh, vq->desc);
> LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") mapped address avail: %p\n",
> dev->device_fh, vq->avail);
> LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") mapped address used: %p\n",
> dev->device_fh, vq->used);
> + LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") log_guest_addr: %p\n",
> + dev->device_fh, (void *)(uintptr_t)vq->log_guest_addr);
>
> return 0;
> }
> --
> 1.9.0
On Wed, Dec 02, 2015 at 03:53:01PM +0200, Victor Kaplansky wrote:
> On Wed, Dec 02, 2015 at 11:43:11AM +0800, Yuanhan Liu wrote:
> > [...]
> >
> > +static inline void __attribute__((always_inline))
> > +vhost_log_page(uint8_t *log_base, uint64_t page)
> > +{
> > + /* TODO: to make it atomic? */
> > + log_base[page / 8] |= 1 << (page % 8);
>
> I think the atomic OR operation is necessary only if there can be
> more than one vhost-user back-end updating the guest's memory
> simultaneously. However, it is probably safe to perform a regular OR
> operation, since rings are not shared between back-ends. What about
> buffers pointed to by descriptors? To be on the safe side, I would use
> the GCC built-in function __sync_fetch_and_or().
The build has to pass not only with gcc, but with icc and clang as
well.
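
FWIW, clang implements the GCC __sync_* built-ins as well, and as far as
I know icc does too. If we would rather not rely on them, the __atomic
built-ins are an alternative (sketch; assumes every supported compiler
provides them):

    /* atomic OR on the dirty bitmap byte; the exact memory-ordering
     * requirements would still need to be reviewed */
    __atomic_fetch_or(&log_base[page / 8], 1 << (page % 8),
                      __ATOMIC_RELAXED);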
>
> > +}
> > +
> > +static inline void __attribute__((always_inline))
> > +vhost_log_write(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > + uint64_t offset, uint64_t len)
> > +{
> > + uint64_t addr = vq->log_guest_addr;
> > + uint64_t page;
> > +
> > + if (unlikely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
> > + !dev->log_base || !len))
> > + return;
>
> Isn't "likely" more appropriate above, since the whole
> expression is expected to be true most of the time?
Sorry, it's a typo; thanks for catching it.
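
I.e. something like:

    if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
                !dev->log_base || !len))
            return;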
--yliu
On 12/2/2015 9:53 PM, Victor Kaplansky wrote:
> On Wed, Dec 02, 2015 at 11:43:11AM +0800, Yuanhan Liu wrote:
>> [...]
>>
>> +static inline void __attribute__((always_inline))
>> +vhost_log_write(struct virtio_net *dev, struct vhost_virtqueue *vq,
>> + uint64_t offset, uint64_t len)
>> +{
>> + uint64_t addr = vq->log_guest_addr;
>> + uint64_t page;
>> +
>> + if (unlikely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
>> + !dev->log_base || !len))
>> + return;
> Isn't "likely" more appropriate above, since the whole
> expression is expected to be true most of the time?
Victor:
So we are not always logging; what is the message that tells the backend
that the migration has started?
[...]
On Wed, Dec 09, 2015 at 03:33:16AM +0000, Xie, Huawei wrote:
...
> >> +static inline void __attribute__((always_inline))
> >> +vhost_log_write(struct virtio_net *dev, struct vhost_virtqueue *vq,
> >> + uint64_t offset, uint64_t len)
> >> +{
> >> + uint64_t addr = vq->log_guest_addr;
> >> + uint64_t page;
> >> +
> >> + if (unlikely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
> >> + !dev->log_base || !len))
> >> + return;
> > Isn't "likely" more appropriate in above, since the whole
> > expression is expected to be true most of the time?
> Victor:
> >> So we are not always logging; what is the message that tells the backend
> >> that the migration has started?
When logging starts, a VHOST_USER_SET_FEATURES request will be sent
again, with the VHOST_F_LOG_ALL feature bit set.
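
For example, the backend could notice that transition in its set-features
handling along these lines (illustrative sketch only; the real handler in
virtio-net.c has a different signature):

    static void
    handle_set_features(struct virtio_net *dev, uint64_t features)
    {
            int log_was_on = !!(dev->features & (1ULL << VHOST_F_LOG_ALL));
            int log_is_on  = !!(features & (1ULL << VHOST_F_LOG_ALL));

            if (!log_was_on && log_is_on)
                    LOG_DEBUG(VHOST_CONFIG,
                            "(%"PRIu64") dirty page logging starts\n",
                            dev->device_fh);

            dev->features = features;
    }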
--yliu
On 12/9/2015 11:41 AM, Yuanhan Liu wrote:
> On Wed, Dec 09, 2015 at 03:33:16AM +0000, Xie, Huawei wrote:
> ...
>>>> [...]
>> Victor:
>> So we are not always logging, what is the message that tells the backend
>> the migration is started?
> When log starts, VHOST_USER_SET_FEATURES request will be sent again,
> with VHOST_F_LOG_ALL feature bit set.
As the VHOST_USER_SET_FEATURES handling and the rx/tx paths run
asynchronously, we have to make sure we don't miss logging anything once
this feature is set. For example, in virtio_dev_rx, is dev->features
volatile? I doubt it is.
> --yliu
>
On Wed, Dec 09, 2015 at 05:44:11AM +0000, Xie, Huawei wrote:
> On 12/9/2015 11:41 AM, Yuanhan Liu wrote:
> > On Wed, Dec 09, 2015 at 03:33:16AM +0000, Xie, Huawei wrote:
> > ...
> >>>> [...]
> >> Victor:
> >> So we are not always logging, what is the message that tells the backend
> >> the migration is started?
> > When log starts, VHOST_USER_SET_FEATURES request will be sent again,
> > with VHOST_F_LOG_ALL feature bit set.
> As the VHOST_USER_SET_FEATURES handling and the rx/tx paths run
> asynchronously, we have to make sure we don't miss logging anything once
> this feature is set.
That's a good reminder. Thanks.
> For example, in virtio_dev_rx, is dev->features volatile? I doubt it
> is.
No, it is not volatile.
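
If that turns out to be a problem, one option (sketch only) is to force a
fresh load in the data path instead of relying on the compiler re-reading
the field:

    /* ACCESS_ONCE-style load of the feature bits on each burst */
    uint64_t features = *(volatile uint64_t *)&dev->features;

    if (unlikely(features & (1ULL << VHOST_F_LOG_ALL)))
            /* logging is enabled; mark the pages we just wrote as dirty */
            vhost_log_write(dev, vq, offset, len);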
--yliu
--- a/lib/librte_vhost/rte_virtio_net.h
+++ b/lib/librte_vhost/rte_virtio_net.h
@@ -40,6 +40,7 @@
*/
#include <stdint.h>
+#include <linux/vhost.h>
#include <linux/virtio_ring.h>
#include <linux/virtio_net.h>
#include <sys/eventfd.h>
@@ -59,6 +60,8 @@ struct rte_mbuf;
/* Backend value set by guest. */
#define VIRTIO_DEV_STOPPED -1
+#define VHOST_LOG_PAGE 4096
+
/* Enum for virtqueue management. */
enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
@@ -82,6 +85,7 @@ struct vhost_virtqueue {
struct vring_desc *desc; /**< Virtqueue descriptor ring. */
struct vring_avail *avail; /**< Virtqueue available ring. */
struct vring_used *used; /**< Virtqueue used ring. */
+ uint64_t log_guest_addr; /**< Physical address of used ring, for logging */
uint32_t size; /**< Size of descriptor ring. */
uint32_t backend; /**< Backend value to determine if device should started/stopped. */
uint16_t vhost_hlen; /**< Vhost header length (varies depending on RX merge buffers. */
@@ -203,6 +207,36 @@ gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa)
return vhost_va;
}
+static inline void __attribute__((always_inline))
+vhost_log_page(uint8_t *log_base, uint64_t page)
+{
+ /* TODO: to make it atomic? */
+ log_base[page / 8] |= 1 << (page % 8);
+}
+
+static inline void __attribute__((always_inline))
+vhost_log_write(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint64_t offset, uint64_t len)
+{
+ uint64_t addr = vq->log_guest_addr;
+ uint64_t page;
+
+ if (unlikely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
+ !dev->log_base || !len))
+ return;
+
+ addr += offset;
+ if (dev->log_size < ((addr + len - 1) / VHOST_LOG_PAGE / 8))
+ return;
+
+ page = addr / VHOST_LOG_PAGE;
+ while (page * VHOST_LOG_PAGE < addr + len) {
+ vhost_log_page(dev->log_base, page);
+ page += VHOST_LOG_PAGE;
+ }
+}
+
+
/**
* Disable features in feature_mask. Returns 0 on success.
*/
--- a/lib/librte_vhost/virtio-net.c
+++ b/lib/librte_vhost/virtio-net.c
@@ -666,12 +666,16 @@ set_vring_addr(struct vhost_device_ctx ctx, struct vhost_vring_addr *addr)
return -1;
}
+ vq->log_guest_addr = addr->log_guest_addr;
+
LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") mapped address desc: %p\n",
dev->device_fh, vq->desc);
LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") mapped address avail: %p\n",
dev->device_fh, vq->avail);
LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") mapped address used: %p\n",
dev->device_fh, vq->used);
+ LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") log_guest_addr: %p\n",
+ dev->device_fh, (void *)(uintptr_t)vq->log_guest_addr);
return 0;
}
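
For reference, a caller in the enqueue path would log the used-ring
updates it performs roughly as follows (sketch only; the helper name is
illustrative, and the offsets are relative to vq->log_guest_addr, i.e.
the guest physical address of the used ring):

    /* Sketch: mark the used-ring bytes we just wrote as dirty. */
    static inline void
    log_used_ring_update(struct virtio_net *dev, struct vhost_virtqueue *vq,
                         uint32_t used_idx)
    {
            /* the used->ring[] element filled in for this buffer */
            vhost_log_write(dev, vq,
                    offsetof(struct vring_used, ring) +
                    used_idx * sizeof(struct vring_used_elem),
                    sizeof(struct vring_used_elem));

            /* the used->idx field bumped afterwards */
            vhost_log_write(dev, vq,
                    offsetof(struct vring_used, idx),
                    sizeof(vq->used->idx));
    }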