[dpdk-dev] examples/vhost: Support jumbo frame in user space vhost

Message ID 6BD6202160B55B409D423293115822625483C6@SHSMSX101.ccr.corp.intel.com (mailing list archive)
State Not Applicable, archived
Headers

Commit Message

Fu, JingguoX Sept. 24, 2014, 9:25 a.m. UTC
Tested-by: Jingguo Fu <jingguox.fu at intel.com>

This patch includes 1 file, and has been tested by Intel.
Please see the test information below:

Host:
Fedora 19 x86_64, Linux Kernel 3.9.0, GCC 4.8.2, Intel Xeon CPU E5-2680 v2 @ 2.80GHz
NIC: Intel Niantic 82599, Intel i350, Intel 82580 and Intel 82576

Guest:
Fedora 16 x86_64, Linux Kernel 3.4.2, GCC 4.6.3, QEMU emulator 1.4.2

We ran the zero-copy and one-copy functional and performance tests as regression tests, with jumbo frame support enabled on the front end.
We verified jumbo frame support on the front end with the Linux legacy back end.
We verified jumbo frame support on the front end with the vhost back end.

-----Original Message-----
From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Ouyang Changchun
Sent: Friday, August 15, 2014 12:58
To: dev@dpdk.org
Subject: [dpdk-dev] [PATCH] examples/vhost: Support jumbo frame in user space vhost

This patch supports the mergeable RX feature and thus supports jumbo frame RX
and TX in user space vhost (as the virtio backend).
 
On RX, it secures enough room from the vring to accommodate one complete scattered
packet received by the PMD from the physical port, and then copies the data from
the mbuf to the vring buffers, possibly across several vring entries and descriptors.
 
On TX, it gets a jumbo frame, possibly described by several vring descriptors
chained together with the 'NEXT' flag, copies them into one scattered packet,
and transmits it to the physical port through the PMD.

Signed-off-by: Changchun Ouyang <changchun.ouyang@intel.com>
Acked-by: Huawei Xie <huawei.xie@intel.com>
---
 examples/vhost/main.c       | 726 ++++++++++++++++++++++++++++++++++++++++----
 examples/vhost/virtio-net.h |  14 +
 2 files changed, 687 insertions(+), 53 deletions(-)
  

Patch

diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index 193aa25..7d9e6a2 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -106,6 +106,8 @@ 
 #define BURST_RX_WAIT_US 15 	/* Defines how long we wait between retries on RX */
 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
 
+#define JUMBO_FRAME_MAX_SIZE    0x2600
+
 /* State of virtio device. */
 #define DEVICE_MAC_LEARNING 0
 #define DEVICE_RX			1
@@ -676,8 +678,12 @@  us_vhost_parse_args(int argc, char **argv)
 					us_vhost_usage(prgname);
 					return -1;
 				} else {
-					if (ret)
+					if (ret) {
+						vmdq_conf_default.rxmode.jumbo_frame = 1;
+						vmdq_conf_default.rxmode.max_rx_pkt_len
+							= JUMBO_FRAME_MAX_SIZE;
 						VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF);
+					}
 				}
 			}
 
@@ -797,6 +803,14 @@  us_vhost_parse_args(int argc, char **argv)
 		return -1;
 	}
 
+	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
+		RTE_LOG(INFO, VHOST_PORT,
+			"Vhost zero copy doesn't support jumbo frame,"
+			"please specify '--mergeable 0' to disable the "
+			"mergeable feature.\n");
+		return -1;
+	}
+
 	return 0;
 }
 
@@ -916,7 +930,7 @@  gpa_to_hpa(struct virtio_net *dev, uint64_t guest_pa,
  * This function adds buffers to the virtio devices RX virtqueue. Buffers can
  * be received from the physical port or from another virtio device. A packet
  * count is returned to indicate the number of packets that were succesfully
- * added to the RX queue.
+ * added to the RX queue. This function works when mergeable is disabled.
  */
 static inline uint32_t __attribute__((always_inline))
 virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
@@ -930,7 +944,6 @@  virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
 	uint64_t buff_hdr_addr = 0;
 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
 	uint32_t head_idx, packet_success = 0;
-	uint32_t mergeable, mrg_count = 0;
 	uint32_t retry = 0;
 	uint16_t avail_idx, res_cur_idx;
 	uint16_t res_base_idx, res_end_idx;
@@ -940,6 +953,7 @@  virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
 	vq = dev->virtqueue[VIRTIO_RXQ];
 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
+
 	/* As many data cores may want access to available buffers, they need to be reserved. */
 	do {
 		res_base_idx = vq->last_used_idx_res;
@@ -976,9 +990,6 @@  virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
 	/* Prefetch available ring to retrieve indexes. */
 	rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);
 
-	/* Check if the VIRTIO_NET_F_MRG_RXBUF feature is enabled. */
-	mergeable = dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);
-
 	/* Retrieve all of the head indexes first to avoid caching issues. */
 	for (head_idx = 0; head_idx < count; head_idx++)
 		head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)];
@@ -997,56 +1008,44 @@  virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
 		/* Prefetch buffer address. */
 		rte_prefetch0((void*)(uintptr_t)buff_addr);
 
-		if (mergeable && (mrg_count != 0)) {
-			desc->len = packet_len = rte_pktmbuf_data_len(buff);
-		} else {
-			/* Copy virtio_hdr to packet and increment buffer address */
-			buff_hdr_addr = buff_addr;
-			packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
+		/* Copy virtio_hdr to packet and increment buffer address */
+		buff_hdr_addr = buff_addr;
+		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
 
-			/*
-			 * If the descriptors are chained the header and data are placed in
-			 * separate buffers.
-			 */
-			if (desc->flags & VRING_DESC_F_NEXT) {
-				desc->len = vq->vhost_hlen;
-				desc = &vq->desc[desc->next];
-				/* Buffer address translation. */
-				buff_addr = gpa_to_vva(dev, desc->addr);
-				desc->len = rte_pktmbuf_data_len(buff);
-			} else {
-				buff_addr += vq->vhost_hlen;
-				desc->len = packet_len;
-			}
+		/*
+		 * If the descriptors are chained the header and data are
+		 * placed in separate buffers.
+		 */
+		if (desc->flags & VRING_DESC_F_NEXT) {
+			desc->len = vq->vhost_hlen;
+			desc = &vq->desc[desc->next];
+			/* Buffer address translation. */
+			buff_addr = gpa_to_vva(dev, desc->addr);
+			desc->len = rte_pktmbuf_data_len(buff);
+		} else {
+			buff_addr += vq->vhost_hlen;
+			desc->len = packet_len;
 		}
 
-		PRINT_PACKET(dev, (uintptr_t)buff_addr, rte_pktmbuf_data_len(buff), 0);
-
 		/* Update used ring with desc information */
 		vq->used->ring[res_cur_idx & (vq->size - 1)].id = head[packet_success];
 		vq->used->ring[res_cur_idx & (vq->size - 1)].len = packet_len;
 
 		/* Copy mbuf data to buffer */
-		rte_memcpy((void *)(uintptr_t)buff_addr, (const void*)buff->pkt.data, rte_pktmbuf_data_len(buff));
+		rte_memcpy((void *)(uintptr_t)buff_addr,
+			(const void *)buff->pkt.data,
+			rte_pktmbuf_data_len(buff));
+		PRINT_PACKET(dev, (uintptr_t)buff_addr,
+			rte_pktmbuf_data_len(buff), 0);
 
 		res_cur_idx++;
 		packet_success++;
 
-		/* If mergeable is disabled then a header is required per buffer. */
-		if (!mergeable) {
-			rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void*)&virtio_hdr, vq->vhost_hlen);
-			PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
-		} else {
-			mrg_count++;
-			/* Merge buffer can only handle so many buffers at a time. Tell the guest if this limit is reached. */
-			if ((mrg_count == MAX_MRG_PKT_BURST) || (res_cur_idx == res_end_idx)) {
-				virtio_hdr.num_buffers = mrg_count;
-				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n", dev->device_fh, virtio_hdr.num_buffers);
-				rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void*)&virtio_hdr, vq->vhost_hlen);
-				PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
-				mrg_count = 0;
-			}
-		}
+		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
+			(const void *)&virtio_hdr, vq->vhost_hlen);
+
+		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
+
 		if (res_cur_idx < res_end_idx) {
 			/* Prefetch descriptor index. */
 			rte_prefetch0(&vq->desc[head[packet_success]]);
@@ -1068,6 +1067,356 @@  virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
 	return count;
 }
 
+static inline uint32_t __attribute__((always_inline))
+copy_from_mbuf_to_vring(struct virtio_net *dev,
+	uint16_t res_base_idx, uint16_t res_end_idx,
+	struct rte_mbuf *pkt)
+{
+	uint32_t vec_idx = 0;
+	uint32_t entry_success = 0;
+	struct vhost_virtqueue *vq;
+	/* The virtio_hdr is initialised to 0. */
+	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {
+		{0, 0, 0, 0, 0, 0}, 0};
+	uint16_t cur_idx = res_base_idx;
+	uint64_t vb_addr = 0;
+	uint64_t vb_hdr_addr = 0;
+	uint32_t seg_offset = 0;
+	uint32_t vb_offset = 0;
+	uint32_t seg_avail;
+	uint32_t vb_avail;
+	uint32_t cpy_len, entry_len;
+
+	if (pkt == NULL)
+		return 0;
+
+	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| "
+		"End Index %d\n",
+		dev->device_fh, cur_idx, res_end_idx);
+
+	/*
+	 * Convert from gpa to vva
+	 * (guest physical addr -> vhost virtual addr)
+	 */
+	vq = dev->virtqueue[VIRTIO_RXQ];
+	vb_addr =
+		gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
+	vb_hdr_addr = vb_addr;
+
+	/* Prefetch buffer address. */
+	rte_prefetch0((void *)(uintptr_t)vb_addr);
+
+	virtio_hdr.num_buffers = res_end_idx - res_base_idx;
+
+	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n",
+		dev->device_fh, virtio_hdr.num_buffers);
+
+	rte_memcpy((void *)(uintptr_t)vb_hdr_addr,
+		(const void *)&virtio_hdr, vq->vhost_hlen);
+
+	PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1);
+
+	seg_avail = rte_pktmbuf_data_len(pkt);
+	vb_offset = vq->vhost_hlen;
+	vb_avail =
+		vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen;
+
+	entry_len = vq->vhost_hlen;
+
+	if (vb_avail == 0) {
+		uint32_t desc_idx =
+			vq->buf_vec[vec_idx].desc_idx;
+		vq->desc[desc_idx].len = vq->vhost_hlen;
+
+		if ((vq->desc[desc_idx].flags
+			& VRING_DESC_F_NEXT) == 0) {
+			/* Update used ring with desc information */
+			vq->used->ring[cur_idx & (vq->size - 1)].id
+				= vq->buf_vec[vec_idx].desc_idx;
+			vq->used->ring[cur_idx & (vq->size - 1)].len
+				= entry_len;
+
+			entry_len = 0;
+			cur_idx++;
+			entry_success++;
+		}
+
+		vec_idx++;
+		vb_addr =
+			gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
+
+		/* Prefetch buffer address. */
+		rte_prefetch0((void *)(uintptr_t)vb_addr);
+		vb_offset = 0;
+		vb_avail = vq->buf_vec[vec_idx].buf_len;
+	}
+
+	cpy_len = RTE_MIN(vb_avail, seg_avail);
+
+	while (cpy_len > 0) {
+		/* Copy mbuf data to vring buffer */
+		rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset),
+			(const void *)(rte_pktmbuf_mtod(pkt, char*) + seg_offset),
+			cpy_len);
+
+		PRINT_PACKET(dev,
+			(uintptr_t)(vb_addr + vb_offset),
+			cpy_len, 0);
+
+		seg_offset += cpy_len;
+		vb_offset += cpy_len;
+		seg_avail -= cpy_len;
+		vb_avail -= cpy_len;
+		entry_len += cpy_len;
+
+		if (seg_avail != 0) {
+			/*
+			 * The virtio buffer in this vring
+			 * entry reach to its end.
+			 * But the segment doesn't complete.
+			 */
+			if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags &
+				VRING_DESC_F_NEXT) == 0) {
+				/* Update used ring with desc information */
+				vq->used->ring[cur_idx & (vq->size - 1)].id
+					= vq->buf_vec[vec_idx].desc_idx;
+				vq->used->ring[cur_idx & (vq->size - 1)].len
+					= entry_len;
+				entry_len = 0;
+				cur_idx++;
+				entry_success++;
+			}
+
+			vec_idx++;
+			vb_addr = gpa_to_vva(dev,
+				vq->buf_vec[vec_idx].buf_addr);
+			vb_offset = 0;
+			vb_avail = vq->buf_vec[vec_idx].buf_len;
+			cpy_len = RTE_MIN(vb_avail, seg_avail);
+		} else {
+			/*
+			 * This current segment complete, need continue to
+			 * check if the whole packet complete or not.
+			 */
+			pkt = pkt->pkt.next;
+			if (pkt != NULL) {
+				/*
+				 * There are more segments.
+				 */
+				if (vb_avail == 0) {
+					/*
+					 * This current buffer from vring is
+					 * used up, need fetch next buffer
+					 * from buf_vec.
+					 */
+					uint32_t desc_idx =
+						vq->buf_vec[vec_idx].desc_idx;
+					vq->desc[desc_idx].len = vb_offset;
+
+					if ((vq->desc[desc_idx].flags &
+						VRING_DESC_F_NEXT) == 0) {
+						uint16_t wrapped_idx =
+							cur_idx & (vq->size - 1);
+						/*
+						 * Update used ring with the
+						 * descriptor information
+						 */
+						vq->used->ring[wrapped_idx].id
+							= desc_idx;
+						vq->used->ring[wrapped_idx].len
+							= entry_len;
+						entry_success++;
+						entry_len = 0;
+						cur_idx++;
+					}
+
+					/* Get next buffer from buf_vec. */
+					vec_idx++;
+					vb_addr = gpa_to_vva(dev,
+						vq->buf_vec[vec_idx].buf_addr);
+					vb_avail =
+						vq->buf_vec[vec_idx].buf_len;
+					vb_offset = 0;
+				}
+
+				seg_offset = 0;
+				seg_avail = rte_pktmbuf_data_len(pkt);
+				cpy_len = RTE_MIN(vb_avail, seg_avail);
+			} else {
+				/*
+				 * This whole packet completes.
+				 */
+				uint32_t desc_idx =
+					vq->buf_vec[vec_idx].desc_idx;
+				vq->desc[desc_idx].len = vb_offset;
+
+				while (vq->desc[desc_idx].flags &
+					VRING_DESC_F_NEXT) {
+					desc_idx = vq->desc[desc_idx].next;
+					 vq->desc[desc_idx].len = 0;
+				}
+
+				/* Update used ring with desc information */
+				vq->used->ring[cur_idx & (vq->size - 1)].id
+					= vq->buf_vec[vec_idx].desc_idx;
+				vq->used->ring[cur_idx & (vq->size - 1)].len
+					= entry_len;
+				entry_len = 0;
+				cur_idx++;
+				entry_success++;
+				seg_avail = 0;
+				cpy_len = RTE_MIN(vb_avail, seg_avail);
+			}
+		}
+	}
+
+	return entry_success;
+}
+
+/*
+ * This function adds buffers to the virtio devices RX virtqueue. Buffers can
+ * be received from the physical port or from another virtio device. A packet
+ * count is returned to indicate the number of packets that were succesfully
+ * added to the RX queue. This function works for mergeable RX.
+ */
+static inline uint32_t __attribute__((always_inline))
+virtio_dev_merge_rx(struct virtio_net *dev, struct rte_mbuf **pkts,
+	uint32_t count)
+{
+	struct vhost_virtqueue *vq;
+	uint32_t pkt_idx = 0, entry_success = 0;
+	uint32_t retry = 0;
+	uint16_t avail_idx, res_cur_idx;
+	uint16_t res_base_idx, res_end_idx;
+	uint8_t success = 0;
+
+	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n",
+		dev->device_fh);
+	vq = dev->virtqueue[VIRTIO_RXQ];
+	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
+
+	if (count == 0)
+		return 0;
+
+	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+		uint32_t secure_len = 0;
+		uint16_t need_cnt;
+		uint32_t vec_idx = 0;
+		uint32_t pkt_len = pkts[pkt_idx]->pkt.pkt_len + vq->vhost_hlen;
+		uint16_t i, id;
+
+		do {
+			/*
+			 * As many data cores may want access to available
+			 * buffers, they need to be reserved.
+			 */
+			res_base_idx = vq->last_used_idx_res;
+			res_cur_idx = res_base_idx;
+
+			do {
+				avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+				if (unlikely(res_cur_idx == avail_idx)) {
+					/*
+					 * If retry is enabled and the queue is
+					 * full then we wait and retry to avoid
+					 * packet loss.
+					 */
+					if (enable_retry) {
+						uint8_t cont = 0;
+						for (retry = 0; retry < burst_rx_retry_num; retry++) {
+							rte_delay_us(burst_rx_delay_time);
+							avail_idx =
+								*((volatile uint16_t *)&vq->avail->idx);
+							if (likely(res_cur_idx != avail_idx)) {
+								cont = 1;
+								break;
+							}
+						}
+						if (cont == 1)
+							continue;
+					}
+
+					LOG_DEBUG(VHOST_DATA,
+						"(%"PRIu64") Failed "
+						"to get enough desc from "
+						"vring\n",
+						dev->device_fh);
+					return pkt_idx;
+				} else {
+					uint16_t wrapped_idx =
+						(res_cur_idx) & (vq->size - 1);
+					uint32_t idx =
+						vq->avail->ring[wrapped_idx];
+					uint8_t next_desc;
+
+					do {
+						next_desc = 0;
+						secure_len += vq->desc[idx].len;
+						if (vq->desc[idx].flags &
+							VRING_DESC_F_NEXT) {
+							idx = vq->desc[idx].next;
+							next_desc = 1;
+						}
+					} while (next_desc);
+
+					res_cur_idx++;
+				}
+			} while (pkt_len > secure_len);
+
+			/* vq->last_used_idx_res is atomically updated. */
+			success = rte_atomic16_cmpset(&vq->last_used_idx_res,
+							res_base_idx,
+							res_cur_idx);
+		} while (success == 0);
+
+		id = res_base_idx;
+		need_cnt = res_cur_idx - res_base_idx;
+
+		for (i = 0; i < need_cnt; i++, id++) {
+			uint16_t wrapped_idx = id & (vq->size - 1);
+			uint32_t idx = vq->avail->ring[wrapped_idx];
+			uint8_t next_desc;
+			do {
+				next_desc = 0;
+				vq->buf_vec[vec_idx].buf_addr =
+					vq->desc[idx].addr;
+				vq->buf_vec[vec_idx].buf_len =
+					vq->desc[idx].len;
+				vq->buf_vec[vec_idx].desc_idx = idx;
+				vec_idx++;
+
+				if (vq->desc[idx].flags & VRING_DESC_F_NEXT) {
+					idx = vq->desc[idx].next;
+					next_desc = 1;
+				}
+			} while (next_desc);
+		}
+
+		res_end_idx = res_cur_idx;
+
+		entry_success = copy_from_mbuf_to_vring(dev, res_base_idx,
+			res_end_idx, pkts[pkt_idx]);
+
+		rte_compiler_barrier();
+
+		/*
+		 * Wait until it's our turn to add our buffer
+		 * to the used ring.
+		 */
+		while (unlikely(vq->last_used_idx != res_base_idx))
+			rte_pause();
+
+		*(volatile uint16_t *)&vq->used->idx += entry_success;
+		vq->last_used_idx = res_end_idx;
+
+		/* Kick the guest if necessary. */
+		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
+			eventfd_write((int)vq->kickfd, 1);
+	}
+
+	return count;
+}
+
 /*
  * Compares a packet destination MAC address to a device MAC address.
  */
@@ -1199,8 +1548,17 @@  virtio_tx_local(struct virtio_net *dev, struct rte_mbuf *m)
 				/*drop the packet if the device is marked for removal*/
 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", dev_ll->dev->device_fh);
 			} else {
+				uint32_t mergeable =
+					dev_ll->dev->features &
+					(1 << VIRTIO_NET_F_MRG_RXBUF);
+
 				/*send the packet to the local virtio device*/
-				ret = virtio_dev_rx(dev_ll->dev, &m, 1);
+				if (likely(mergeable == 0))
+					ret = virtio_dev_rx(dev_ll->dev, &m, 1);
+				else
+					ret = virtio_dev_merge_rx(dev_ll->dev,
+						&m, 1);
+
 				if (enable_stats) {
 					rte_atomic64_add(
 					&dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
@@ -1231,7 +1589,7 @@  virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, struct rte_mempool *
 	struct mbuf_table *tx_q;
 	struct vlan_ethhdr *vlan_hdr;
 	struct rte_mbuf **m_table;
-	struct rte_mbuf *mbuf;
+	struct rte_mbuf *mbuf, *prev;
 	unsigned len, ret, offset = 0;
 	const uint16_t lcore_id = rte_lcore_id();
 	struct virtio_net_data_ll *dev_ll = ll_root_used;
@@ -1284,12 +1642,14 @@  virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, struct rte_mempool *
 	/* Allocate an mbuf and populate the structure. */
 	mbuf = rte_pktmbuf_alloc(mbuf_pool);
 	if (unlikely(mbuf == NULL)) {
-		RTE_LOG(ERR, VHOST_DATA, "Failed to allocate memory for mbuf.\n");
+		RTE_LOG(ERR, VHOST_DATA,
+			"Failed to allocate memory for mbuf.\n");
 		return;
 	}
 
 	mbuf->pkt.data_len = m->pkt.data_len + VLAN_HLEN + offset;
-	mbuf->pkt.pkt_len = mbuf->pkt.data_len;
+	mbuf->pkt.pkt_len = m->pkt.pkt_len + VLAN_HLEN + offset;
+	mbuf->pkt.nb_segs = m->pkt.nb_segs;
 
 	/* Copy ethernet header to mbuf. */
 	rte_memcpy((void*)mbuf->pkt.data, (const void*)m->pkt.data, ETH_HLEN);
@@ -1304,6 +1664,29 @@  virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, struct rte_mempool *
 	/* Copy the remaining packet contents to the mbuf. */
 	rte_memcpy((void*) ((uint8_t*)mbuf->pkt.data + VLAN_ETH_HLEN),
 		(const void*) ((uint8_t*)m->pkt.data + ETH_HLEN), (m->pkt.data_len - ETH_HLEN));
+
+	/* Copy the remaining segments for the whole packet. */
+	prev = mbuf;
+	while (m->pkt.next) {
+		/* Allocate an mbuf and populate the structure. */
+		struct rte_mbuf *next_mbuf = rte_pktmbuf_alloc(mbuf_pool);
+		if (unlikely(next_mbuf == NULL)) {
+			rte_pktmbuf_free(mbuf);
+			RTE_LOG(ERR, VHOST_DATA,
+				"Failed to allocate memory for mbuf.\n");
+			return;
+		}
+
+		m = m->pkt.next;
+		prev->pkt.next = next_mbuf;
+		prev = next_mbuf;
+		next_mbuf->pkt.data_len = m->pkt.data_len;
+
+		/* Copy data to next mbuf. */
+		rte_memcpy(rte_pktmbuf_mtod(next_mbuf, void *),
+			rte_pktmbuf_mtod(m, const void *), m->pkt.data_len);
+	}
+
 	tx_q->m_table[len] = mbuf;
 	len++;
 	if (enable_stats) {
@@ -1394,6 +1777,7 @@  virtio_dev_tx(struct virtio_net* dev, struct rte_mempool *mbuf_pool)
 
 		/* Setup dummy mbuf. This is copied to a real mbuf if transmitted out the physical port. */
 		m.pkt.data_len = desc->len;
+		m.pkt.pkt_len = desc->len;
 		m.pkt.data = (void*)(uintptr_t)buff_addr;
 
 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
@@ -1420,6 +1804,227 @@  virtio_dev_tx(struct virtio_net* dev, struct rte_mempool *mbuf_pool)
 		eventfd_write((int)vq->kickfd, 1);
 }
 
+/* This function works for TX packets with mergeable feature enabled. */
+static inline void __attribute__((always_inline))
+virtio_dev_merge_tx(struct virtio_net *dev, struct rte_mempool *mbuf_pool)
+{
+	struct rte_mbuf *m, *prev;
+	struct vhost_virtqueue *vq;
+	struct vring_desc *desc;
+	uint64_t vb_addr = 0;
+	uint32_t head[MAX_PKT_BURST];
+	uint32_t used_idx;
+	uint32_t i;
+	uint16_t free_entries, entry_success = 0;
+	uint16_t avail_idx;
+	uint32_t buf_size = MBUF_SIZE - (sizeof(struct rte_mbuf)
+			+ RTE_PKTMBUF_HEADROOM);
+
+	vq = dev->virtqueue[VIRTIO_TXQ];
+	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
+
+	/* If there are no available buffers then return. */
+	if (vq->last_used_idx == avail_idx)
+		return;
+
+	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_tx()\n",
+		dev->device_fh);
+
+	/* Prefetch available ring to retrieve head indexes. */
+	rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);
+
+	/*get the number of free entries in the ring*/
+	free_entries = (avail_idx - vq->last_used_idx);
+
+	/* Limit to MAX_PKT_BURST. */
+	free_entries = RTE_MIN(free_entries, MAX_PKT_BURST);
+
+	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
+		dev->device_fh, free_entries);
+	/* Retrieve all of the head indexes first to avoid caching issues. */
+	for (i = 0; i < free_entries; i++)
+		head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];
+
+	/* Prefetch descriptor index. */
+	rte_prefetch0(&vq->desc[head[entry_success]]);
+	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
+
+	while (entry_success < free_entries) {
+		uint32_t vb_avail, vb_offset;
+		uint32_t seg_avail, seg_offset;
+		uint32_t cpy_len;
+		uint32_t seg_num = 0;
+		struct rte_mbuf *cur;
+		uint8_t alloc_err = 0;
+
+		desc = &vq->desc[head[entry_success]];
+
+		/* Discard first buffer as it is the virtio header */
+		desc = &vq->desc[desc->next];
+
+		/* Buffer address translation. */
+		vb_addr = gpa_to_vva(dev, desc->addr);
+		/* Prefetch buffer address. */
+		rte_prefetch0((void *)(uintptr_t)vb_addr);
+
+		used_idx = vq->last_used_idx & (vq->size - 1);
+
+		if (entry_success < (free_entries - 1)) {
+			/* Prefetch descriptor index. */
+			rte_prefetch0(&vq->desc[head[entry_success+1]]);
+			rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
+		}
+
+		/* Update used index buffer information. */
+		vq->used->ring[used_idx].id = head[entry_success];
+		vq->used->ring[used_idx].len = 0;
+
+		vb_offset = 0;
+		vb_avail = desc->len;
+		seg_offset = 0;
+		seg_avail = buf_size;
+		cpy_len = RTE_MIN(vb_avail, seg_avail);
+
+		PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0);
+
+		/* Allocate an mbuf and populate the structure. */
+		m = rte_pktmbuf_alloc(mbuf_pool);
+		if (unlikely(m == NULL)) {
+			RTE_LOG(ERR, VHOST_DATA,
+				"Failed to allocate memory for mbuf.\n");
+			return;
+		}
+
+		seg_num++;
+		cur = m;
+		prev = m;
+		while (cpy_len != 0) {
+			rte_memcpy((void *)(rte_pktmbuf_mtod(cur, char *) + seg_offset),
+				(void *)((uintptr_t)(vb_addr + vb_offset)),
+				cpy_len);
+
+			seg_offset += cpy_len;
+			vb_offset += cpy_len;
+			vb_avail -= cpy_len;
+			seg_avail -= cpy_len;
+
+			if (vb_avail != 0) {
+				/*
+				 * The segment reachs to its end,
+				 * while the virtio buffer in TX vring has
+				 * more data to be copied.
+				 */
+				cur->pkt.data_len = seg_offset;
+				m->pkt.pkt_len += seg_offset;
+				/* Allocate mbuf and populate the structure. */
+				cur = rte_pktmbuf_alloc(mbuf_pool);
+				if (unlikely(cur == NULL)) {
+					RTE_LOG(ERR, VHOST_DATA, "Failed to "
+						"allocate memory for mbuf.\n");
+					rte_pktmbuf_free(m);
+					alloc_err = 1;
+					break;
+				}
+
+				seg_num++;
+				prev->pkt.next = cur;
+				prev = cur;
+				seg_offset = 0;
+				seg_avail = buf_size;
+			} else {
+				if (desc->flags & VRING_DESC_F_NEXT) {
+					/*
+					 * There are more virtio buffers in
+					 * same vring entry need to be copied.
+					 */
+					if (seg_avail == 0) {
+						/*
+						 * The current segment hasn't
+						 * room to accomodate more
+						 * data.
+						 */
+						cur->pkt.data_len = seg_offset;
+						m->pkt.pkt_len += seg_offset;
+						/*
+						 * Allocate an mbuf and
+						 * populate the structure.
+						 */
+						cur = rte_pktmbuf_alloc(mbuf_pool);
+						if (unlikely(cur == NULL)) {
+							RTE_LOG(ERR,
+								VHOST_DATA,
+								"Failed to "
+								"allocate memory "
+								"for mbuf\n");
+							rte_pktmbuf_free(m);
+							alloc_err = 1;
+							break;
+						}
+						seg_num++;
+						prev->pkt.next = cur;
+						prev = cur;
+						seg_offset = 0;
+						seg_avail = buf_size;
+					}
+
+					desc = &vq->desc[desc->next];
+
+					/* Buffer address translation. */
+					vb_addr = gpa_to_vva(dev, desc->addr);
+					/* Prefetch buffer address. */
+					rte_prefetch0((void *)(uintptr_t)vb_addr);
+					vb_offset = 0;
+					vb_avail = desc->len;
+
+					PRINT_PACKET(dev, (uintptr_t)vb_addr,
+						desc->len, 0);
+				} else {
+					/* The whole packet completes. */
+					cur->pkt.data_len = seg_offset;
+					m->pkt.pkt_len += seg_offset;
+					vb_avail = 0;
+				}
+			}
+
+			cpy_len = RTE_MIN(vb_avail, seg_avail);
+		}
+
+		if (unlikely(alloc_err == 1))
+			break;
+
+		m->pkt.nb_segs = seg_num;
+
+		/*
+		 * If this is the first received packet we need to learn
+		 * the MAC and setup VMDQ
+		 */
+		if (dev->ready == DEVICE_MAC_LEARNING) {
+			if (dev->remove || (link_vmdq(dev, m) == -1)) {
+				/*
+				 * Discard frame if device is scheduled for
+				 * removal or a duplicate MAC address is found.
+				 */
+				entry_success = free_entries;
+				vq->last_used_idx += entry_success;
+				rte_pktmbuf_free(m);
+				break;
+			}
+		}
+
+		virtio_tx_route(dev, m, mbuf_pool, (uint16_t)dev->device_fh);
+		vq->last_used_idx++;
+		entry_success++;
+		rte_pktmbuf_free(m);
+	}
+
+	rte_compiler_barrier();
+	vq->used->idx += entry_success;
+	/* Kick guest if required. */
+	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
+		eventfd_write((int)vq->kickfd, 1);
+
+}
+
 /*
  * This function is called by each data core. It handles all RX/TX registered with the
  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
@@ -1440,8 +2045,9 @@  switch_worker(__attribute__((unused)) void *arg)
 	const uint16_t lcore_id = rte_lcore_id();
 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
 	uint16_t rx_count = 0;
+	uint32_t mergeable = 0;
 
-	RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started \n", lcore_id);
+	RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
 	lcore_ll = lcore_info[lcore_id].lcore_ll;
 	prev_tsc = 0;
 
@@ -1497,6 +2103,8 @@  switch_worker(__attribute__((unused)) void *arg)
 		while (dev_ll != NULL) {
 			/*get virtio device ID*/
 			dev = dev_ll->dev;
+			mergeable =
+				dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);
 
 			if (dev->remove) {
 				dev_ll = dev_ll->next;
@@ -1510,7 +2118,15 @@  switch_worker(__attribute__((unused)) void *arg)
 					(uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
 
 				if (rx_count) {
-					ret_count = virtio_dev_rx(dev, pkts_burst, rx_count);
+					if (likely(mergeable == 0))
+						ret_count =
+							virtio_dev_rx(dev,
+							pkts_burst, rx_count);
+					else
+						ret_count =
+							virtio_dev_merge_rx(dev,
+							pkts_burst, rx_count);
+
 					if (enable_stats) {
 						rte_atomic64_add(
 						&dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
@@ -1520,15 +2136,19 @@  switch_worker(__attribute__((unused)) void *arg)
 					}
 					while (likely(rx_count)) {
 						rx_count--;
-						rte_pktmbuf_free_seg(pkts_burst[rx_count]);
+						rte_pktmbuf_free(pkts_burst[rx_count]);
 					}
 
 				}
 			}
 
-			if (!dev->remove)
+			if (!dev->remove) {
 				/*Handle guest TX*/
-				virtio_dev_tx(dev, mbuf_pool);
+				if (likely(mergeable == 0))
+					virtio_dev_tx(dev, mbuf_pool);
+				else
+					virtio_dev_merge_tx(dev, mbuf_pool);
+			}
 
 			/*move to the next device in the list*/
 			dev_ll = dev_ll->next;
diff --git a/examples/vhost/virtio-net.h b/examples/vhost/virtio-net.h
index 3d1f255..1a2f0dc 100644
--- a/examples/vhost/virtio-net.h
+++ b/examples/vhost/virtio-net.h
@@ -45,6 +45,18 @@ 
 /* Enum for virtqueue management. */
 enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
 
+#define BUF_VECTOR_MAX 256
+
+/*
+ * Structure contains buffer address, length and descriptor index
+ * from vring to do scatter RX.
+*/
+struct buf_vector {
+uint64_t buf_addr;
+uint32_t buf_len;
+uint32_t desc_idx;
+};
+
 /*
  * Structure contains variables relevant to TX/RX virtqueues.
  */
@@ -60,6 +72,8 @@  struct vhost_virtqueue
 	volatile uint16_t	last_used_idx_res;	/* Used for multiple devices reserving buffers. */
 	eventfd_t			callfd;				/* Currently unused as polling mode is enabled. */
 	eventfd_t			kickfd;				/* Used to notify the guest (trigger interrupt). */
+	/* Used for scatter RX. */
+	struct buf_vector	buf_vec[BUF_VECTOR_MAX];
 } __rte_cache_aligned;
 
 /*