[v6,06/16] vdpa/ifc: add block device SW live-migration

Message ID 1650530034-59744-7-git-send-email-andy.pei@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Maxime Coquelin
Headers
Series add virtio_blk device support to vdpa/ifc |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Pei, Andy April 21, 2022, 8:33 a.m. UTC
  Add SW live-migration support to block device.
Add dirty page logging to block device.

Signed-off-by: Andy Pei <andy.pei@intel.com>
---
 drivers/vdpa/ifc/base/ifcvf.c |   4 +-
 drivers/vdpa/ifc/base/ifcvf.h |   6 ++
 drivers/vdpa/ifc/ifcvf_vdpa.c | 128 +++++++++++++++++++++++++++++++++++-------
 3 files changed, 115 insertions(+), 23 deletions(-)
  

Comments

Chenbo Xia April 25, 2022, 1:10 p.m. UTC | #1
> -----Original Message-----
> From: Pei, Andy <andy.pei@intel.com>
> Sent: Thursday, April 21, 2022 4:34 PM
> To: dev@dpdk.org
> Cc: Xia, Chenbo <chenbo.xia@intel.com>; maxime.coquelin@redhat.com; Cao,
> Gang <gang.cao@intel.com>; Liu, Changpeng <changpeng.liu@intel.com>
> Subject: [PATCH v6 06/16] vdpa/ifc: add block device SW live-migration
> 
> Add SW live-migration support to block device.
> Add dirty page logging to block device.

Add SW live-migration support including dirty page logging for block device.

> 
> Signed-off-by: Andy Pei <andy.pei@intel.com>
> ---
>  drivers/vdpa/ifc/base/ifcvf.c |   4 +-
>  drivers/vdpa/ifc/base/ifcvf.h |   6 ++
>  drivers/vdpa/ifc/ifcvf_vdpa.c | 128 +++++++++++++++++++++++++++++++++++--
> -----
>  3 files changed, 115 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/vdpa/ifc/base/ifcvf.c b/drivers/vdpa/ifc/base/ifcvf.c
> index d10c1fd..e417c50 100644
> --- a/drivers/vdpa/ifc/base/ifcvf.c
> +++ b/drivers/vdpa/ifc/base/ifcvf.c
> @@ -191,7 +191,7 @@
>  	IFCVF_WRITE_REG32(val >> 32, hi);
>  }
> 
> -STATIC int
> +int
>  ifcvf_hw_enable(struct ifcvf_hw *hw)
>  {
>  	struct ifcvf_pci_common_cfg *cfg;
> @@ -240,7 +240,7 @@
>  	return 0;
>  }
> 
> -STATIC void
> +void
>  ifcvf_hw_disable(struct ifcvf_hw *hw)
>  {
>  	u32 i;
> diff --git a/drivers/vdpa/ifc/base/ifcvf.h b/drivers/vdpa/ifc/base/ifcvf.h
> index 769c603..6dd7925 100644
> --- a/drivers/vdpa/ifc/base/ifcvf.h
> +++ b/drivers/vdpa/ifc/base/ifcvf.h
> @@ -179,4 +179,10 @@ struct ifcvf_hw {
>  u64
>  ifcvf_get_queue_notify_off(struct ifcvf_hw *hw, int qid);
> 
> +int
> +ifcvf_hw_enable(struct ifcvf_hw *hw);
> +
> +void
> +ifcvf_hw_disable(struct ifcvf_hw *hw);
> +
>  #endif /* _IFCVF_H_ */
> diff --git a/drivers/vdpa/ifc/ifcvf_vdpa.c b/drivers/vdpa/ifc/ifcvf_vdpa.c
> index 8d104b7..a23dc2d 100644
> --- a/drivers/vdpa/ifc/ifcvf_vdpa.c
> +++ b/drivers/vdpa/ifc/ifcvf_vdpa.c
> @@ -345,6 +345,56 @@ struct rte_vdpa_dev_info {
>  	}
>  }
> 
> +static void
> +vdpa_ifcvf_blk_pause(struct ifcvf_internal *internal)
> +{
> +	struct ifcvf_hw *hw = &internal->hw;
> +	struct rte_vhost_vring vq;
> +	int i, vid;
> +	uint64_t features = 0;
> +	uint64_t log_base = 0, log_size = 0;
> +	uint64_t len;
> +
> +	vid = internal->vid;
> +
> +	if (internal->device_type == IFCVF_BLK) {
> +		for (i = 0; i < hw->nr_vring; i++) {
> +			rte_vhost_get_vhost_vring(internal->vid, i, &vq);
> +			while (vq.avail->idx != vq.used->idx) {
> +				ifcvf_notify_queue(hw, i);
> +				usleep(10);
> +			}
> +			hw->vring[i].last_avail_idx = vq.avail->idx;
> +			hw->vring[i].last_used_idx = vq.used->idx;
> +		}
> +	}
> +
> +	ifcvf_hw_disable(hw);
> +
> +	for (i = 0; i < hw->nr_vring; i++)
> +		rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
> +				hw->vring[i].last_used_idx);
> +
> +	if (internal->sw_lm)
> +		return;
> +
> +	rte_vhost_get_negotiated_features(vid, &features);
> +	if (RTE_VHOST_NEED_LOG(features)) {
> +		ifcvf_disable_logging(hw);
> +		rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
> +		rte_vfio_container_dma_unmap(internal->vfio_container_fd,
> +				log_base, IFCVF_LOG_BASE, log_size);
> +		/*
> +		 * IFCVF marks dirty memory pages for only packet buffer,
> +		 * SW helps to mark the used ring as dirty after device stops.
> +		 */
> +		for (i = 0; i < hw->nr_vring; i++) {
> +			len = IFCVF_USED_RING_LEN(hw->vring[i].size);
> +			rte_vhost_log_used_vring(vid, i, 0, len);
> +		}
> +	}
> +}

Can we consider combining vdpa_ifcvf_blk_pause and vdpa_ifcvf_stop into one function and
checking the device type internally to do different things? Because as I see, most of the
logic is the same.

> +
>  #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
>  		sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))
>  static int
> @@ -659,15 +709,22 @@ struct rte_vdpa_dev_info {
>  		}
>  		hw->vring[i].avail = gpa;
> 
> -		/* Direct I/O for Tx queue, relay for Rx queue */
> -		if (i & 1) {
> -			gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
> -			if (gpa == 0) {
> -				DRV_LOG(ERR, "Fail to get GPA for used ring.");
> -				return -1;
> +		if (internal->device_type == IFCVF_NET) {
> +			/* Direct I/O for Tx queue, relay for Rx queue */
> +			if (i & 1) {
> +				gpa = hva_to_gpa(vid,
> (uint64_t)(uintptr_t)vq.used);
> +				if (gpa == 0) {
> +					DRV_LOG(ERR, "Fail to get GPA for used
> ring.");
> +					return -1;
> +				}
> +				hw->vring[i].used = gpa;
> +			} else {
> +				hw->vring[i].used = m_vring_iova +
> +					(char *)internal->m_vring[i].used -
> +					(char *)internal->m_vring[i].desc;
>  			}
> -			hw->vring[i].used = gpa;
> -		} else {
> +		} else if (internal->device_type == IFCVF_BLK) {
> +			/* BLK: relay every queue */
>  			hw->vring[i].used = m_vring_iova +
>  				(char *)internal->m_vring[i].used -
>  				(char *)internal->m_vring[i].desc;
> @@ -686,7 +743,10 @@ struct rte_vdpa_dev_info {
>  	}
>  	hw->nr_vring = nr_vring;
> 
> -	return ifcvf_start_hw(&internal->hw);
> +	if (internal->device_type == IFCVF_NET)
> +		return ifcvf_start_hw(&internal->hw);
> +	else if (internal->device_type == IFCVF_BLK)
> +		return ifcvf_hw_enable(&internal->hw);
> 
>  error:
>  	for (i = 0; i < nr_vring; i++)
> @@ -710,8 +770,12 @@ struct rte_vdpa_dev_info {
> 
>  	for (i = 0; i < hw->nr_vring; i++) {
>  		/* synchronize remaining new used entries if any */
> -		if ((i & 1) == 0)
> +		if (internal->device_type == IFCVF_NET) {
> +			if ((i & 1) == 0)
> +				update_used_ring(internal, i);
> +		} else if (internal->device_type == IFCVF_BLK) {
>  			update_used_ring(internal, i);
> +		}
> 
>  		rte_vhost_get_vhost_vring(vid, i, &vq);
>  		len = IFCVF_USED_RING_LEN(vq.size);
> @@ -773,17 +837,36 @@ struct rte_vdpa_dev_info {
>  		}
>  	}
> 
> -	for (qid = 0; qid < q_num; qid += 2) {
> -		ev.events = EPOLLIN | EPOLLPRI;
> -		/* leave a flag to mark it's for interrupt */
> -		ev.data.u64 = 1 | qid << 1 |
> -			(uint64_t)internal->intr_fd[qid] << 32;
> -		if (epoll_ctl(epfd, EPOLL_CTL_ADD, internal->intr_fd[qid], &ev)
> -				< 0) {
> -			DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
> -			return NULL;
> +	if (internal->device_type == IFCVF_NET) {
> +		for (qid = 0; qid < q_num; qid += 2) {
> +			ev.events = EPOLLIN | EPOLLPRI;
> +			/* leave a flag to mark it's for interrupt */
> +			ev.data.u64 = 1 | qid << 1 |
> +				(uint64_t)internal->intr_fd[qid] << 32;
> +			if (epoll_ctl(epfd, EPOLL_CTL_ADD,
> +				      internal->intr_fd[qid], &ev)
> +					< 0) {
> +				DRV_LOG(ERR, "epoll add error: %s",
> +					strerror(errno));
> +				return NULL;
> +			}
> +			update_used_ring(internal, qid);
> +		}
> +	} else if (internal->device_type == IFCVF_BLK) {
> +		for (qid = 0; qid < q_num; qid += 1) {
> +			ev.events = EPOLLIN | EPOLLPRI;
> +			/* leave a flag to mark it's for interrupt */
> +			ev.data.u64 = 1 | qid << 1 |
> +				(uint64_t)internal->intr_fd[qid] << 32;
> +			if (epoll_ctl(epfd, EPOLL_CTL_ADD,
> +				      internal->intr_fd[qid], &ev)
> +					< 0) {
> +				DRV_LOG(ERR, "epoll add error: %s",
> +					strerror(errno));
> +				return NULL;
> +			}
> +			update_used_ring(internal, qid);

It seems we can also reduce the duplicate code for the above case. And for the other checks,
if we can use only one combined condition, I prefer to just use one.

Thanks,
Chenbo

>  		}
> -		update_used_ring(internal, qid);
>  	}
> 
>  	/* start relay with a first kick */
> @@ -871,7 +954,10 @@ struct rte_vdpa_dev_info {
> 
>  	/* stop the direct IO data path */
>  	unset_notify_relay(internal);
> -	vdpa_ifcvf_stop(internal);
> +	if (internal->device_type == IFCVF_NET)
> +		vdpa_ifcvf_stop(internal);
> +	else if (internal->device_type == IFCVF_BLK)
> +		vdpa_ifcvf_blk_pause(internal);
>  	vdpa_disable_vfio_intr(internal);
> 
>  	ret = rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, false);
> --
> 1.8.3.1
  
Pei, Andy April 26, 2022, 10:07 a.m. UTC | #2
Hi Chenbo,

Thanks for your reply.
My reply is inline.

> -----Original Message-----
> From: Xia, Chenbo <chenbo.xia@intel.com>
> Sent: Monday, April 25, 2022 9:10 PM
> To: Pei, Andy <andy.pei@intel.com>; dev@dpdk.org
> Cc: maxime.coquelin@redhat.com; Cao, Gang <gang.cao@intel.com>; Liu,
> Changpeng <changpeng.liu@intel.com>
> Subject: RE: [PATCH v6 06/16] vdpa/ifc: add block device SW live-migration
> 
> > -----Original Message-----
> > From: Pei, Andy <andy.pei@intel.com>
> > Sent: Thursday, April 21, 2022 4:34 PM
> > To: dev@dpdk.org
> > Cc: Xia, Chenbo <chenbo.xia@intel.com>; maxime.coquelin@redhat.com;
> > Cao, Gang <gang.cao@intel.com>; Liu, Changpeng
> > <changpeng.liu@intel.com>
> > Subject: [PATCH v6 06/16] vdpa/ifc: add block device SW live-migration
> >
> > Add SW live-migration support to block device.
> > Add dirty page logging to block device.
> 
> Add SW live-migration support including dirty page logging for block device.
> 
Sure, I will remove "Add dirty page logging to block device." in the next version.
> >
> > Signed-off-by: Andy Pei <andy.pei@intel.com>
> > ---
> >  drivers/vdpa/ifc/base/ifcvf.c |   4 +-
> >  drivers/vdpa/ifc/base/ifcvf.h |   6 ++
> >  drivers/vdpa/ifc/ifcvf_vdpa.c | 128
> > +++++++++++++++++++++++++++++++++++--
> > -----
> >  3 files changed, 115 insertions(+), 23 deletions(-)
> >
> > diff --git a/drivers/vdpa/ifc/base/ifcvf.c
> > b/drivers/vdpa/ifc/base/ifcvf.c index d10c1fd..e417c50 100644
> > --- a/drivers/vdpa/ifc/base/ifcvf.c
> > +++ b/drivers/vdpa/ifc/base/ifcvf.c
> > @@ -191,7 +191,7 @@
> >  	IFCVF_WRITE_REG32(val >> 32, hi);
> >  }
> >
> > -STATIC int
> > +int
> >  ifcvf_hw_enable(struct ifcvf_hw *hw)
> >  {
> >  	struct ifcvf_pci_common_cfg *cfg;
> > @@ -240,7 +240,7 @@
> >  	return 0;
> >  }
> >
> > -STATIC void
> > +void
> >  ifcvf_hw_disable(struct ifcvf_hw *hw)  {
> >  	u32 i;
> > diff --git a/drivers/vdpa/ifc/base/ifcvf.h
> > b/drivers/vdpa/ifc/base/ifcvf.h index 769c603..6dd7925 100644
> > --- a/drivers/vdpa/ifc/base/ifcvf.h
> > +++ b/drivers/vdpa/ifc/base/ifcvf.h
> > @@ -179,4 +179,10 @@ struct ifcvf_hw {
> >  u64
> >  ifcvf_get_queue_notify_off(struct ifcvf_hw *hw, int qid);
> >
> > +int
> > +ifcvf_hw_enable(struct ifcvf_hw *hw);
> > +
> > +void
> > +ifcvf_hw_disable(struct ifcvf_hw *hw);
> > +
> >  #endif /* _IFCVF_H_ */
> > diff --git a/drivers/vdpa/ifc/ifcvf_vdpa.c
> > b/drivers/vdpa/ifc/ifcvf_vdpa.c index 8d104b7..a23dc2d 100644
> > --- a/drivers/vdpa/ifc/ifcvf_vdpa.c
> > +++ b/drivers/vdpa/ifc/ifcvf_vdpa.c
> > @@ -345,6 +345,56 @@ struct rte_vdpa_dev_info {
> >  	}
> >  }
> >
> > +static void
> > +vdpa_ifcvf_blk_pause(struct ifcvf_internal *internal) {
> > +	struct ifcvf_hw *hw = &internal->hw;
> > +	struct rte_vhost_vring vq;
> > +	int i, vid;
> > +	uint64_t features = 0;
> > +	uint64_t log_base = 0, log_size = 0;
> > +	uint64_t len;
> > +
> > +	vid = internal->vid;
> > +
> > +	if (internal->device_type == IFCVF_BLK) {
> > +		for (i = 0; i < hw->nr_vring; i++) {
> > +			rte_vhost_get_vhost_vring(internal->vid, i, &vq);
> > +			while (vq.avail->idx != vq.used->idx) {
> > +				ifcvf_notify_queue(hw, i);
> > +				usleep(10);
> > +			}
> > +			hw->vring[i].last_avail_idx = vq.avail->idx;
> > +			hw->vring[i].last_used_idx = vq.used->idx;
> > +		}
> > +	}
> > +
> > +	ifcvf_hw_disable(hw);
> > +
> > +	for (i = 0; i < hw->nr_vring; i++)
> > +		rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
> > +				hw->vring[i].last_used_idx);
> > +
> > +	if (internal->sw_lm)
> > +		return;
> > +
> > +	rte_vhost_get_negotiated_features(vid, &features);
> > +	if (RTE_VHOST_NEED_LOG(features)) {
> > +		ifcvf_disable_logging(hw);
> > +		rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
> > +		rte_vfio_container_dma_unmap(internal->vfio_container_fd,
> > +				log_base, IFCVF_LOG_BASE, log_size);
> > +		/*
> > +		 * IFCVF marks dirty memory pages for only packet buffer,
> > +		 * SW helps to mark the used ring as dirty after device stops.
> > +		 */
> > +		for (i = 0; i < hw->nr_vring; i++) {
> > +			len = IFCVF_USED_RING_LEN(hw->vring[i].size);
> > +			rte_vhost_log_used_vring(vid, i, 0, len);
> > +		}
> > +	}
> > +}
> 
> Can we consider combining vdpa_ifcvf_blk_pause and vdpa_ifcvf_stop to
> one function and check device type internally to do different things? Because
> as I see, most logic is the same.
> 
OK, I will address it in next version.
> > +
> >  #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
> >  		sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))  static int @@ -
> 659,15
> > +709,22 @@ struct rte_vdpa_dev_info {
> >  		}
> >  		hw->vring[i].avail = gpa;
> >
> > -		/* Direct I/O for Tx queue, relay for Rx queue */
> > -		if (i & 1) {
> > -			gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
> > -			if (gpa == 0) {
> > -				DRV_LOG(ERR, "Fail to get GPA for used
> ring.");
> > -				return -1;
> > +		if (internal->device_type == IFCVF_NET) {
> > +			/* Direct I/O for Tx queue, relay for Rx queue */
> > +			if (i & 1) {
> > +				gpa = hva_to_gpa(vid,
> > (uint64_t)(uintptr_t)vq.used);
> > +				if (gpa == 0) {
> > +					DRV_LOG(ERR, "Fail to get GPA for
> used
> > ring.");
> > +					return -1;
> > +				}
> > +				hw->vring[i].used = gpa;
> > +			} else {
> > +				hw->vring[i].used = m_vring_iova +
> > +					(char *)internal->m_vring[i].used -
> > +					(char *)internal->m_vring[i].desc;
> >  			}
> > -			hw->vring[i].used = gpa;
> > -		} else {
> > +		} else if (internal->device_type == IFCVF_BLK) {
> > +			/* BLK: relay every queue */
> >  			hw->vring[i].used = m_vring_iova +
> >  				(char *)internal->m_vring[i].used -
> >  				(char *)internal->m_vring[i].desc; @@ -686,7
> +743,10 @@ struct
> > rte_vdpa_dev_info {
> >  	}
> >  	hw->nr_vring = nr_vring;
> >
> > -	return ifcvf_start_hw(&internal->hw);
> > +	if (internal->device_type == IFCVF_NET)
> > +		return ifcvf_start_hw(&internal->hw);
> > +	else if (internal->device_type == IFCVF_BLK)
> > +		return ifcvf_hw_enable(&internal->hw);
> >
> >  error:
> >  	for (i = 0; i < nr_vring; i++)
> > @@ -710,8 +770,12 @@ struct rte_vdpa_dev_info {
> >
> >  	for (i = 0; i < hw->nr_vring; i++) {
> >  		/* synchronize remaining new used entries if any */
> > -		if ((i & 1) == 0)
> > +		if (internal->device_type == IFCVF_NET) {
> > +			if ((i & 1) == 0)
> > +				update_used_ring(internal, i);
> > +		} else if (internal->device_type == IFCVF_BLK) {
> >  			update_used_ring(internal, i);
> > +		}
> >
> >  		rte_vhost_get_vhost_vring(vid, i, &vq);
> >  		len = IFCVF_USED_RING_LEN(vq.size); @@ -773,17 +837,36
> @@ struct
> > rte_vdpa_dev_info {
> >  		}
> >  	}
> >
> > -	for (qid = 0; qid < q_num; qid += 2) {
> > -		ev.events = EPOLLIN | EPOLLPRI;
> > -		/* leave a flag to mark it's for interrupt */
> > -		ev.data.u64 = 1 | qid << 1 |
> > -			(uint64_t)internal->intr_fd[qid] << 32;
> > -		if (epoll_ctl(epfd, EPOLL_CTL_ADD, internal->intr_fd[qid],
> &ev)
> > -				< 0) {
> > -			DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
> > -			return NULL;
> > +	if (internal->device_type == IFCVF_NET) {
> > +		for (qid = 0; qid < q_num; qid += 2) {
> > +			ev.events = EPOLLIN | EPOLLPRI;
> > +			/* leave a flag to mark it's for interrupt */
> > +			ev.data.u64 = 1 | qid << 1 |
> > +				(uint64_t)internal->intr_fd[qid] << 32;
> > +			if (epoll_ctl(epfd, EPOLL_CTL_ADD,
> > +				      internal->intr_fd[qid], &ev)
> > +					< 0) {
> > +				DRV_LOG(ERR, "epoll add error: %s",
> > +					strerror(errno));
> > +				return NULL;
> > +			}
> > +			update_used_ring(internal, qid);
> > +		}
> > +	} else if (internal->device_type == IFCVF_BLK) {
> > +		for (qid = 0; qid < q_num; qid += 1) {
> > +			ev.events = EPOLLIN | EPOLLPRI;
> > +			/* leave a flag to mark it's for interrupt */
> > +			ev.data.u64 = 1 | qid << 1 |
> > +				(uint64_t)internal->intr_fd[qid] << 32;
> > +			if (epoll_ctl(epfd, EPOLL_CTL_ADD,
> > +				      internal->intr_fd[qid], &ev)
> > +					< 0) {
> > +				DRV_LOG(ERR, "epoll add error: %s",
> > +					strerror(errno));
> > +				return NULL;
> > +			}
> > +			update_used_ring(internal, qid);
> 
> It seems we can also reduce duplicate code for above case. And for other
> checks, if we can use only one combined condition to check, I prefer to just
> use one.
> 
OK, I will address it in next version.
> Thanks,
> Chenbo
> 
> >  		}
> > -		update_used_ring(internal, qid);
> >  	}
> >
> >  	/* start relay with a first kick */
> > @@ -871,7 +954,10 @@ struct rte_vdpa_dev_info {
> >
> >  	/* stop the direct IO data path */
> >  	unset_notify_relay(internal);
> > -	vdpa_ifcvf_stop(internal);
> > +	if (internal->device_type == IFCVF_NET)
> > +		vdpa_ifcvf_stop(internal);
> > +	else if (internal->device_type == IFCVF_BLK)
> > +		vdpa_ifcvf_blk_pause(internal);
> >  	vdpa_disable_vfio_intr(internal);
> >
> >  	ret = rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL,
> false);
> > --
> > 1.8.3.1
>
  

Patch

diff --git a/drivers/vdpa/ifc/base/ifcvf.c b/drivers/vdpa/ifc/base/ifcvf.c
index d10c1fd..e417c50 100644
--- a/drivers/vdpa/ifc/base/ifcvf.c
+++ b/drivers/vdpa/ifc/base/ifcvf.c
@@ -191,7 +191,7 @@ 
 	IFCVF_WRITE_REG32(val >> 32, hi);
 }
 
-STATIC int
+int
 ifcvf_hw_enable(struct ifcvf_hw *hw)
 {
 	struct ifcvf_pci_common_cfg *cfg;
@@ -240,7 +240,7 @@ 
 	return 0;
 }
 
-STATIC void
+void
 ifcvf_hw_disable(struct ifcvf_hw *hw)
 {
 	u32 i;
diff --git a/drivers/vdpa/ifc/base/ifcvf.h b/drivers/vdpa/ifc/base/ifcvf.h
index 769c603..6dd7925 100644
--- a/drivers/vdpa/ifc/base/ifcvf.h
+++ b/drivers/vdpa/ifc/base/ifcvf.h
@@ -179,4 +179,10 @@  struct ifcvf_hw {
 u64
 ifcvf_get_queue_notify_off(struct ifcvf_hw *hw, int qid);
 
+int
+ifcvf_hw_enable(struct ifcvf_hw *hw);
+
+void
+ifcvf_hw_disable(struct ifcvf_hw *hw);
+
 #endif /* _IFCVF_H_ */
diff --git a/drivers/vdpa/ifc/ifcvf_vdpa.c b/drivers/vdpa/ifc/ifcvf_vdpa.c
index 8d104b7..a23dc2d 100644
--- a/drivers/vdpa/ifc/ifcvf_vdpa.c
+++ b/drivers/vdpa/ifc/ifcvf_vdpa.c
@@ -345,6 +345,56 @@  struct rte_vdpa_dev_info {
 	}
 }
 
+static void
+vdpa_ifcvf_blk_pause(struct ifcvf_internal *internal)
+{
+	struct ifcvf_hw *hw = &internal->hw;
+	struct rte_vhost_vring vq;
+	int i, vid;
+	uint64_t features = 0;
+	uint64_t log_base = 0, log_size = 0;
+	uint64_t len;
+
+	vid = internal->vid;
+
+	if (internal->device_type == IFCVF_BLK) {
+		for (i = 0; i < hw->nr_vring; i++) {
+			rte_vhost_get_vhost_vring(internal->vid, i, &vq);
+			while (vq.avail->idx != vq.used->idx) {
+				ifcvf_notify_queue(hw, i);
+				usleep(10);
+			}
+			hw->vring[i].last_avail_idx = vq.avail->idx;
+			hw->vring[i].last_used_idx = vq.used->idx;
+		}
+	}
+
+	ifcvf_hw_disable(hw);
+
+	for (i = 0; i < hw->nr_vring; i++)
+		rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
+				hw->vring[i].last_used_idx);
+
+	if (internal->sw_lm)
+		return;
+
+	rte_vhost_get_negotiated_features(vid, &features);
+	if (RTE_VHOST_NEED_LOG(features)) {
+		ifcvf_disable_logging(hw);
+		rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
+		rte_vfio_container_dma_unmap(internal->vfio_container_fd,
+				log_base, IFCVF_LOG_BASE, log_size);
+		/*
+		 * IFCVF marks dirty memory pages for only packet buffer,
+		 * SW helps to mark the used ring as dirty after device stops.
+		 */
+		for (i = 0; i < hw->nr_vring; i++) {
+			len = IFCVF_USED_RING_LEN(hw->vring[i].size);
+			rte_vhost_log_used_vring(vid, i, 0, len);
+		}
+	}
+}
+
 #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
 		sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))
 static int
@@ -659,15 +709,22 @@  struct rte_vdpa_dev_info {
 		}
 		hw->vring[i].avail = gpa;
 
-		/* Direct I/O for Tx queue, relay for Rx queue */
-		if (i & 1) {
-			gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
-			if (gpa == 0) {
-				DRV_LOG(ERR, "Fail to get GPA for used ring.");
-				return -1;
+		if (internal->device_type == IFCVF_NET) {
+			/* Direct I/O for Tx queue, relay for Rx queue */
+			if (i & 1) {
+				gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
+				if (gpa == 0) {
+					DRV_LOG(ERR, "Fail to get GPA for used ring.");
+					return -1;
+				}
+				hw->vring[i].used = gpa;
+			} else {
+				hw->vring[i].used = m_vring_iova +
+					(char *)internal->m_vring[i].used -
+					(char *)internal->m_vring[i].desc;
 			}
-			hw->vring[i].used = gpa;
-		} else {
+		} else if (internal->device_type == IFCVF_BLK) {
+			/* BLK: relay every queue */
 			hw->vring[i].used = m_vring_iova +
 				(char *)internal->m_vring[i].used -
 				(char *)internal->m_vring[i].desc;
@@ -686,7 +743,10 @@  struct rte_vdpa_dev_info {
 	}
 	hw->nr_vring = nr_vring;
 
-	return ifcvf_start_hw(&internal->hw);
+	if (internal->device_type == IFCVF_NET)
+		return ifcvf_start_hw(&internal->hw);
+	else if (internal->device_type == IFCVF_BLK)
+		return ifcvf_hw_enable(&internal->hw);
 
 error:
 	for (i = 0; i < nr_vring; i++)
@@ -710,8 +770,12 @@  struct rte_vdpa_dev_info {
 
 	for (i = 0; i < hw->nr_vring; i++) {
 		/* synchronize remaining new used entries if any */
-		if ((i & 1) == 0)
+		if (internal->device_type == IFCVF_NET) {
+			if ((i & 1) == 0)
+				update_used_ring(internal, i);
+		} else if (internal->device_type == IFCVF_BLK) {
 			update_used_ring(internal, i);
+		}
 
 		rte_vhost_get_vhost_vring(vid, i, &vq);
 		len = IFCVF_USED_RING_LEN(vq.size);
@@ -773,17 +837,36 @@  struct rte_vdpa_dev_info {
 		}
 	}
 
-	for (qid = 0; qid < q_num; qid += 2) {
-		ev.events = EPOLLIN | EPOLLPRI;
-		/* leave a flag to mark it's for interrupt */
-		ev.data.u64 = 1 | qid << 1 |
-			(uint64_t)internal->intr_fd[qid] << 32;
-		if (epoll_ctl(epfd, EPOLL_CTL_ADD, internal->intr_fd[qid], &ev)
-				< 0) {
-			DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
-			return NULL;
+	if (internal->device_type == IFCVF_NET) {
+		for (qid = 0; qid < q_num; qid += 2) {
+			ev.events = EPOLLIN | EPOLLPRI;
+			/* leave a flag to mark it's for interrupt */
+			ev.data.u64 = 1 | qid << 1 |
+				(uint64_t)internal->intr_fd[qid] << 32;
+			if (epoll_ctl(epfd, EPOLL_CTL_ADD,
+				      internal->intr_fd[qid], &ev)
+					< 0) {
+				DRV_LOG(ERR, "epoll add error: %s",
+					strerror(errno));
+				return NULL;
+			}
+			update_used_ring(internal, qid);
+		}
+	} else if (internal->device_type == IFCVF_BLK) {
+		for (qid = 0; qid < q_num; qid += 1) {
+			ev.events = EPOLLIN | EPOLLPRI;
+			/* leave a flag to mark it's for interrupt */
+			ev.data.u64 = 1 | qid << 1 |
+				(uint64_t)internal->intr_fd[qid] << 32;
+			if (epoll_ctl(epfd, EPOLL_CTL_ADD,
+				      internal->intr_fd[qid], &ev)
+					< 0) {
+				DRV_LOG(ERR, "epoll add error: %s",
+					strerror(errno));
+				return NULL;
+			}
+			update_used_ring(internal, qid);
 		}
-		update_used_ring(internal, qid);
 	}
 
 	/* start relay with a first kick */
@@ -871,7 +954,10 @@  struct rte_vdpa_dev_info {
 
 	/* stop the direct IO data path */
 	unset_notify_relay(internal);
-	vdpa_ifcvf_stop(internal);
+	if (internal->device_type == IFCVF_NET)
+		vdpa_ifcvf_stop(internal);
+	else if (internal->device_type == IFCVF_BLK)
+		vdpa_ifcvf_blk_pause(internal);
 	vdpa_disable_vfio_intr(internal);
 
 	ret = rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, false);