[dpdk-dev,v4] ixgbe_pmd: enforce RS bit on every EOP descriptor for devices newer than 82598

Message ID 1440085070-13989-1-git-send-email-vladz@cloudius-systems.com (mailing list archive)
State Superseded, archived

Commit Message

Vladislav Zolotarov Aug. 20, 2015, 3:37 p.m. UTC
According to the 82599 and x540 HW specifications, the RS bit *must* be
set in the last descriptor of *every* packet.

Before this patch there were three types of Tx callbacks, each setting the
RS bit only once every tx_rs_thresh descriptors. This patch introduces a set
of new callbacks, one for each type mentioned above, that set the RS bit in
every EOP descriptor.

ixgbe_set_tx_function() will set the appropriate Tx callback according
to the device family.

This patch fixes the Tx hang we were constantly hitting with a
seastar-based application on an x540 NIC.

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
---
New in v4:
   - Styling (white spaces) fixes.

New in v3:
   - Enforce the RS bit setting instead of enforcing tx_rs_thresh to be 1.
---
 drivers/net/ixgbe/ixgbe_ethdev.c   |  14 +++-
 drivers/net/ixgbe/ixgbe_ethdev.h   |   4 ++
 drivers/net/ixgbe/ixgbe_rxtx.c     | 139 ++++++++++++++++++++++++++++---------
 drivers/net/ixgbe/ixgbe_rxtx.h     |   2 +
 drivers/net/ixgbe/ixgbe_rxtx_vec.c |  29 ++++++--
 5 files changed, 149 insertions(+), 39 deletions(-)
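
For orientation before reading the diff: below is a minimal, standalone sketch
of the two RS-bit policies involved. The flag values and helper names are
illustrative placeholders, not the real ixgbe definitions used by the patch.

#include <stdint.h>
#include <stdio.h>

#define TXD_CMD_EOP  (1u << 24)	/* hypothetical End Of Packet flag */
#define TXD_CMD_RS   (1u << 27)	/* hypothetical Report Status flag */

/* Deferred policy (pre-patch): RS only once every tx_rs_thresh descriptors. */
static uint32_t
eop_flags_deferred(uint16_t nb_tx_used, uint16_t tx_rs_thresh)
{
	uint32_t flags = TXD_CMD_EOP;

	if (nb_tx_used >= tx_rs_thresh)
		flags |= TXD_CMD_RS;	/* request one write-back per batch */
	return flags;
}

/* Always-RS policy (this patch, devices newer than 82598): RS on every EOP. */
static uint32_t
eop_flags_always_rs(void)
{
	return TXD_CMD_EOP | TXD_CMD_RS;
}

int
main(void)
{
	printf("deferred, 31/32 used: 0x%08x\n",
	       (unsigned)eop_flags_deferred(31, 32));
	printf("deferred, 32/32 used: 0x%08x\n",
	       (unsigned)eop_flags_deferred(32, 32));
	printf("always RS:            0x%08x\n",
	       (unsigned)eop_flags_always_rs());
	return 0;
}

The first call leaves RS clear (the threshold is not yet reached); the other
two set it, which is what the new *_always_rs callbacks do unconditionally on
every EOP descriptor.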
  

Comments

Vladislav Zolotarov Aug. 24, 2015, 8:11 a.m. UTC | #1
On 08/20/15 18:37, Vlad Zolotarov wrote:
> According to the 82599 and x540 HW specifications, the RS bit *must* be
> set in the last descriptor of *every* packet.
>
> Before this patch there were three types of Tx callbacks, each setting the
> RS bit only once every tx_rs_thresh descriptors. This patch introduces a set
> of new callbacks, one for each type mentioned above, that set the RS bit in
> every EOP descriptor.
>
> ixgbe_set_tx_function() will set the appropriate Tx callback according
> to the device family.

[+Jesse and Jeff]

I've started to look at the i40e PMD and it has the same RS bit deferring
logic as the ixgbe PMD has (surprise, surprise!.. ;)). To recall, the i40e
PMD uses a descriptor write-back completion mode.

From the HW spec it's unclear whether the RS bit should be set on *every*
descriptor with the EOP bit. However, I noticed that the Linux driver, before
it moved to HEAD write-back mode, was setting the RS bit on every EOP
descriptor.

So, here is a question to the Intel folks: could you please clarify this point?

Thanks in advance,
vlad
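
For readers unfamiliar with the completion modes: a sketch, with hypothetical
names, of what descriptor write-back looks like from the host side. The HW
writes a Descriptor Done bit back only into descriptors that carried RS, so
with RS deferring the driver polls one descriptor per tx_rs_thresh-sized
batch, while with RS on every EOP each packet produces its own write-back.

#include <stdbool.h>
#include <stdint.h>

#define TXD_STAT_DD  (1u << 0)	/* hypothetical Descriptor Done bit */

struct txd {
	volatile uint32_t wb_status;	/* status word written back by HW */
};

/* Free a batch of transmitted mbufs only once the descriptor that carried
 * the RS bit for that batch reports Descriptor Done. */
static bool
batch_done(const struct txd *ring, uint16_t next_dd)
{
	return (ring[next_dd].wb_status & TXD_STAT_DD) != 0;
}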

> [remainder of the original message and the full quoted patch trimmed;
> see the Patch section below]
  
Thomas Monjalon Oct. 27, 2015, 6:09 p.m. UTC | #2
Any follow-up to this discussion?
Should we mark this patch as rejected?

2015-08-24 11:11, Vlad Zolotarov:
> [Vlad's message of Aug. 24 quoted in full; trimmed]
  
Vladislav Zolotarov Oct. 27, 2015, 6:47 p.m. UTC | #3
On 10/27/15 20:09, Thomas Monjalon wrote:
> Any follow-up to this discussion?
> Should we mark this patch as rejected?

Hmmm... This patch fixes an obvious spec violation. Why would it be 
rejected?

>
> [earlier quotes trimmed]
  
Jesse Brandeburg Oct. 27, 2015, 6:50 p.m. UTC | #4
+ixgbe developers.

-----Original Message-----
From: Vlad Zolotarov [mailto:vladz@cloudius-systems.com] 
Sent: Tuesday, October 27, 2015 11:48 AM
To: Thomas Monjalon; Ananyev, Konstantin; Zhang, Helin
Cc: dev@dpdk.org; Kirsher, Jeffrey T; Brandeburg, Jesse
Subject: Re: [dpdk-dev] [PATCH v4] ixgbe_pmd: enforce RS bit on every EOP descriptor for devices newer than 82598



On 10/27/15 20:09, Thomas Monjalon wrote:
> Any follow-up to this discussion?
> Should we mark this patch as rejected?

Hmmm... This patch fixes an obvious spec violation. Why would it be 
rejected?

>
> [earlier quotes trimmed]
  
Ananyev, Konstantin Oct. 27, 2015, 7:10 p.m. UTC | #5
Hi lads,

> -----Original Message-----
> From: Vlad Zolotarov [mailto:vladz@cloudius-systems.com]
> Sent: Tuesday, October 27, 2015 6:48 PM
> To: Thomas Monjalon; Ananyev, Konstantin; Zhang, Helin
> Cc: dev@dpdk.org; Kirsher, Jeffrey T; Brandeburg, Jesse
> Subject: Re: [dpdk-dev] [PATCH v4] ixgbe_pmd: enforce RS bit on every EOP descriptor for devices newer than 82598
> 
> 
> 
> On 10/27/15 20:09, Thomas Monjalon wrote:
> > Any follow-up to this discussion?
> > Should we mark this patch as rejected?
> 
> Hmmm... This patch fixes an obvious spec violation. Why would it be
> rejected?

No, I don't think we can reject the patch:
there is a reproducible TX hang in the ixgbe PMD under the described conditions.
Though, as I explained here:
http://dpdk.org/ml/archives/dev/2015-September/023574.html
Vlad's patch would cause quite a big slowdown.
We are still in the process of getting an answer from the HW guys on whether
there are any alternatives that would fix the problem and avoid the slowdown.
Konstantin

>
> [earlier quotes trimmed]
  
Vladislav Zolotarov Oct. 27, 2015, 7:14 p.m. UTC | #6
On 10/27/15 21:10, Ananyev, Konstantin wrote:
> Hi lads,
>
>> -----Original Message-----
>> From: Vlad Zolotarov [mailto:vladz@cloudius-systems.com]
>> Sent: Tuesday, October 27, 2015 6:48 PM
>> To: Thomas Monjalon; Ananyev, Konstantin; Zhang, Helin
>> Cc: dev@dpdk.org; Kirsher, Jeffrey T; Brandeburg, Jesse
>> Subject: Re: [dpdk-dev] [PATCH v4] ixgbe_pmd: enforce RS bit on every EOP descriptor for devices newer than 82598
>>
>>
>>
>> On 10/27/15 20:09, Thomas Monjalon wrote:
>>> Any follow-up to this discussion?
>>> Should we mark this patch as rejected?
>> Hmmm... This patch fixes an obvious spec violation. Why would it be
>> rejected?
> No, I don't think we can reject the patch:
> there is a reproducible TX hang in the ixgbe PMD under the described conditions.
> Though, as I explained here:
> http://dpdk.org/ml/archives/dev/2015-September/023574.html
> Vlad's patch would cause quite a big slowdown.
> We are still in the process of getting an answer from the HW guys on whether
> there are any alternatives that would fix the problem and avoid the slowdown.

+1

> Konstantin
>
>>> [earlier quotes trimmed]
  
Ananyev, Konstantin Nov. 9, 2015, 7:21 p.m. UTC | #7
The first patch contains changes in testpmd that allow reproducing the issue.
The second patch is the actual fix.

Konstantin Ananyev (2):
  testpmd: add ability to split outgoing packets
  ixgbe: fix TX hang when RS distance exceeds HW limit

 app/test-pmd/cmdline.c                      |  57 +++++++++-
 app/test-pmd/config.c                       |  61 +++++++++++
 app/test-pmd/csumonly.c                     | 163 +++++++++++++++++++++++++++-
 app/test-pmd/testpmd.c                      |   3 +
 app/test-pmd/testpmd.h                      |  10 ++
 app/test-pmd/txonly.c                       |  13 ++-
 app/test/test_pmd_perf.c                    |   8 +-
 doc/guides/rel_notes/release_2_2.rst        |   7 ++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  21 +++-
 drivers/net/ixgbe/ixgbe_rxtx.c              |  32 +++++-
 10 files changed, 357 insertions(+), 18 deletions(-)
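
The series title points at the alternative eventually taken: keep deferring
RS, but never let the distance between two RS descriptors exceed what the HW
tolerates. A rough sketch of the idea only; the limit value and helper below
are assumptions for illustration, not the actual DPDK fix:

#include <stdint.h>

#define HW_RS_DIST_LIMIT 32	/* assumed HW limit, for illustration only */

/* Decide whether this packet's EOP descriptor must carry RS so that the
 * distance from the previously flagged descriptor stays within the limit.
 * nb_tx_used counts descriptors used since the last RS; nb_used is the
 * number of descriptors this packet consumes. */
static int
need_rs(uint16_t nb_tx_used, uint16_t nb_used, uint16_t tx_rs_thresh)
{
	uint16_t thresh = tx_rs_thresh < HW_RS_DIST_LIMIT ?
			  tx_rs_thresh : HW_RS_DIST_LIMIT;

	return (uint16_t)(nb_tx_used + nb_used) >= thresh;
}

Compared with the always-RS callbacks from v4, this keeps write-backs batched
for small single-segment packets while still bounding the RS distance.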
  
Ananyev, Konstantin Nov. 10, 2015, 1:48 p.m. UTC | #8
The first patch contains changes in testpmd that allow reproducing the issue.
The second patch is the actual fix.

Konstantin Ananyev (2):
  testpmd: add ability to split outgoing packets
  ixgbe: fix TX hang when RS distance exceeds HW limit

 app/test-pmd/cmdline.c                      |  57 +++++++++-
 app/test-pmd/config.c                       |  61 +++++++++++
 app/test-pmd/csumonly.c                     | 163 +++++++++++++++++++++++++++-
 app/test-pmd/testpmd.c                      |   3 +
 app/test-pmd/testpmd.h                      |  10 ++
 app/test-pmd/txonly.c                       |  13 ++-
 app/test/test_pmd_perf.c                    |   8 +-
 doc/guides/rel_notes/release_2_2.rst        |   7 ++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  21 +++-
 drivers/net/ixgbe/ixgbe_rxtx.c              |  32 +++++-
 10 files changed, 357 insertions(+), 18 deletions(-)
  
De Lara Guarch, Pablo Nov. 10, 2015, 2:06 p.m. UTC | #9
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Konstantin
> Ananyev
> Sent: Tuesday, November 10, 2015 1:48 PM
> To: dev@dpdk.org
> Subject: [dpdk-dev] [PATCHv7 0/2] ixgbe: fix TX hang when RS distance
> exceeds HW limit
> 
> The first patch contains changes in testpmd that allow reproducing the issue.
> The second patch is the actual fix.
> 
> Konstantin Ananyev (2):
>   testpmd: add ability to split outgoing packets
>   ixgbe: fix TX hang when RS distance exceeds HW limit
> 
> [diffstat trimmed]

Series-acked-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
  
Thomas Monjalon Nov. 11, 2015, 11:15 p.m. UTC | #10
> > The first patch contains changes in testpmd that allow reproducing the issue.
> > The second patch is the actual fix.
> > 
> > Konstantin Ananyev (2):
> >   testpmd: add ability to split outgoing packets
> >   ixgbe: fix TX hang when RS distance exceeds HW limit
> 
> Series-acked-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>

Applied, thanks
  

Patch

diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index b8ee1e9..355882c 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -866,12 +866,17 @@  eth_ixgbe_dev_init(struct rte_eth_dev *eth_dev)
 	uint32_t ctrl_ext;
 	uint16_t csum;
 	int diag, i;
+	bool rs_deferring_allowed = (hw->mac.type <= ixgbe_mac_82598EB);
 
 	PMD_INIT_FUNC_TRACE();
 
 	eth_dev->dev_ops = &ixgbe_eth_dev_ops;
 	eth_dev->rx_pkt_burst = &ixgbe_recv_pkts;
-	eth_dev->tx_pkt_burst = &ixgbe_xmit_pkts;
+
+	if (rs_deferring_allowed)
+		eth_dev->tx_pkt_burst = &ixgbe_xmit_pkts;
+	else
+		eth_dev->tx_pkt_burst = &ixgbe_xmit_pkts_always_rs;
 
 	/*
 	 * For secondary processes, we don't initialise any further as primary
@@ -1147,12 +1152,17 @@  eth_ixgbevf_dev_init(struct rte_eth_dev *eth_dev)
 	struct ixgbe_hwstrip *hwstrip =
 		IXGBE_DEV_PRIVATE_TO_HWSTRIP_BITMAP(eth_dev->data->dev_private);
 	struct ether_addr *perm_addr = (struct ether_addr *) hw->mac.perm_addr;
+	bool rs_deferring_allowed = (hw->mac.type <= ixgbe_mac_82598EB);
 
 	PMD_INIT_FUNC_TRACE();
 
 	eth_dev->dev_ops = &ixgbevf_eth_dev_ops;
 	eth_dev->rx_pkt_burst = &ixgbe_recv_pkts;
-	eth_dev->tx_pkt_burst = &ixgbe_xmit_pkts;
+
+	if (rs_deferring_allowed)
+		eth_dev->tx_pkt_burst = &ixgbe_xmit_pkts;
+	else
+		eth_dev->tx_pkt_burst = &ixgbe_xmit_pkts_always_rs;
 
 	/* for secondary processes, we don't initialise any further as primary
 	 * has already done this work. Only check we don't need a different
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.h b/drivers/net/ixgbe/ixgbe_ethdev.h
index c3d4f4f..390356d 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.h
+++ b/drivers/net/ixgbe/ixgbe_ethdev.h
@@ -367,9 +367,13 @@  uint16_t ixgbe_recv_pkts_lro_bulk_alloc(void *rx_queue,
 
 uint16_t ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 		uint16_t nb_pkts);
+uint16_t ixgbe_xmit_pkts_always_rs(
+	void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
 
 uint16_t ixgbe_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
 		uint16_t nb_pkts);
+uint16_t ixgbe_xmit_pkts_simple_always_rs(
+	void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
 
 int ixgbe_dev_rss_hash_update(struct rte_eth_dev *dev,
 			      struct rte_eth_rss_conf *rss_conf);
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 91023b9..044f72c 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -164,11 +164,16 @@  ixgbe_tx_free_bufs(struct ixgbe_tx_queue *txq)
 
 /* Populate 4 descriptors with data from 4 mbufs */
 static inline void
-tx4(volatile union ixgbe_adv_tx_desc *txdp, struct rte_mbuf **pkts)
+tx4(volatile union ixgbe_adv_tx_desc *txdp, struct rte_mbuf **pkts,
+    bool always_rs)
 {
 	uint64_t buf_dma_addr;
 	uint32_t pkt_len;
 	int i;
+	uint32_t flags = DCMD_DTYP_FLAGS;
+
+	if (always_rs)
+		flags |= IXGBE_ADVTXD_DCMD_RS;
 
 	for (i = 0; i < 4; ++i, ++txdp, ++pkts) {
 		buf_dma_addr = RTE_MBUF_DATA_DMA_ADDR(*pkts);
@@ -178,7 +183,7 @@  tx4(volatile union ixgbe_adv_tx_desc *txdp, struct rte_mbuf **pkts)
 		txdp->read.buffer_addr = rte_cpu_to_le_64(buf_dma_addr);
 
 		txdp->read.cmd_type_len =
-			rte_cpu_to_le_32((uint32_t)DCMD_DTYP_FLAGS | pkt_len);
+			rte_cpu_to_le_32(flags | pkt_len);
 
 		txdp->read.olinfo_status =
 			rte_cpu_to_le_32(pkt_len << IXGBE_ADVTXD_PAYLEN_SHIFT);
@@ -189,10 +194,15 @@  tx4(volatile union ixgbe_adv_tx_desc *txdp, struct rte_mbuf **pkts)
 
 /* Populate 1 descriptor with data from 1 mbuf */
 static inline void
-tx1(volatile union ixgbe_adv_tx_desc *txdp, struct rte_mbuf **pkts)
+tx1(volatile union ixgbe_adv_tx_desc *txdp, struct rte_mbuf **pkts,
+    bool always_rs)
 {
 	uint64_t buf_dma_addr;
 	uint32_t pkt_len;
+	uint32_t flags = DCMD_DTYP_FLAGS;
+
+	if (always_rs)
+		flags |= IXGBE_ADVTXD_DCMD_RS;
 
 	buf_dma_addr = RTE_MBUF_DATA_DMA_ADDR(*pkts);
 	pkt_len = (*pkts)->data_len;
@@ -200,7 +210,7 @@  tx1(volatile union ixgbe_adv_tx_desc *txdp, struct rte_mbuf **pkts)
 	/* write data to descriptor */
 	txdp->read.buffer_addr = rte_cpu_to_le_64(buf_dma_addr);
 	txdp->read.cmd_type_len =
-			rte_cpu_to_le_32((uint32_t)DCMD_DTYP_FLAGS | pkt_len);
+			rte_cpu_to_le_32(flags | pkt_len);
 	txdp->read.olinfo_status =
 			rte_cpu_to_le_32(pkt_len << IXGBE_ADVTXD_PAYLEN_SHIFT);
 	rte_prefetch0(&(*pkts)->pool);
@@ -212,7 +222,7 @@  tx1(volatile union ixgbe_adv_tx_desc *txdp, struct rte_mbuf **pkts)
  */
 static inline void
 ixgbe_tx_fill_hw_ring(struct ixgbe_tx_queue *txq, struct rte_mbuf **pkts,
-		      uint16_t nb_pkts)
+		      uint16_t nb_pkts, bool always_rs)
 {
 	volatile union ixgbe_adv_tx_desc *txdp = &(txq->tx_ring[txq->tx_tail]);
 	struct ixgbe_tx_entry *txep = &(txq->sw_ring[txq->tx_tail]);
@@ -232,20 +242,21 @@  ixgbe_tx_fill_hw_ring(struct ixgbe_tx_queue *txq, struct rte_mbuf **pkts,
 		for (j = 0; j < N_PER_LOOP; ++j) {
 			(txep + i + j)->mbuf = *(pkts + i + j);
 		}
-		tx4(txdp + i, pkts + i);
+		tx4(txdp + i, pkts + i, always_rs);
 	}
 
 	if (unlikely(leftover > 0)) {
 		for (i = 0; i < leftover; ++i) {
 			(txep + mainpart + i)->mbuf = *(pkts + mainpart + i);
-			tx1(txdp + mainpart + i, pkts + mainpart + i);
+			tx1(txdp + mainpart + i, pkts + mainpart + i,
+			    always_rs);
 		}
 	}
 }
 
 static inline uint16_t
 tx_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
-	     uint16_t nb_pkts)
+	     uint16_t nb_pkts, bool always_rs)
 {
 	struct ixgbe_tx_queue *txq = (struct ixgbe_tx_queue *)tx_queue;
 	volatile union ixgbe_adv_tx_desc *tx_r = txq->tx_ring;
@@ -281,22 +292,25 @@  tx_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 	 */
 	if ((txq->tx_tail + nb_pkts) > txq->nb_tx_desc) {
 		n = (uint16_t)(txq->nb_tx_desc - txq->tx_tail);
-		ixgbe_tx_fill_hw_ring(txq, tx_pkts, n);
+		ixgbe_tx_fill_hw_ring(txq, tx_pkts, n, always_rs);
 
-		/*
-		 * We know that the last descriptor in the ring will need to
-		 * have its RS bit set because tx_rs_thresh has to be
-		 * a divisor of the ring size
-		 */
-		tx_r[txq->tx_next_rs].read.cmd_type_len |=
-			rte_cpu_to_le_32(IXGBE_ADVTXD_DCMD_RS);
-		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+		if (!always_rs) {
+			/*
+			 * We know that the last descriptor in the ring will
+			 * need to have its RS bit set because tx_rs_thresh has
+			 * to be a divisor of the ring size
+			 */
+			tx_r[txq->tx_next_rs].read.cmd_type_len |=
+				rte_cpu_to_le_32(IXGBE_ADVTXD_DCMD_RS);
+			txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+		}
 
 		txq->tx_tail = 0;
 	}
 
 	/* Fill H/W descriptor ring with mbuf data */
-	ixgbe_tx_fill_hw_ring(txq, tx_pkts + n, (uint16_t)(nb_pkts - n));
+	ixgbe_tx_fill_hw_ring(txq, tx_pkts + n, (uint16_t)(nb_pkts - n),
+			      always_rs);
 	txq->tx_tail = (uint16_t)(txq->tx_tail + (nb_pkts - n));
 
 	/*
@@ -306,7 +320,7 @@  tx_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 	 * but instead of subtracting 1 and doing >=, we can just do
 	 * greater than without subtracting.
 	 */
-	if (txq->tx_tail > txq->tx_next_rs) {
+	if (!always_rs && txq->tx_tail > txq->tx_next_rs) {
 		tx_r[txq->tx_next_rs].read.cmd_type_len |=
 			rte_cpu_to_le_32(IXGBE_ADVTXD_DCMD_RS);
 		txq->tx_next_rs = (uint16_t)(txq->tx_next_rs +
@@ -329,22 +343,23 @@  tx_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 	return nb_pkts;
 }
 
-uint16_t
-ixgbe_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
-		       uint16_t nb_pkts)
+/* if always_rs is TRUE set RS bit on every descriptor with EOP bit */
+static inline uint16_t
+_ixgbe_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
+			uint16_t nb_pkts, bool always_rs)
 {
 	uint16_t nb_tx;
 
 	/* Try to transmit at least chunks of TX_MAX_BURST pkts */
 	if (likely(nb_pkts <= RTE_PMD_IXGBE_TX_MAX_BURST))
-		return tx_xmit_pkts(tx_queue, tx_pkts, nb_pkts);
+		return tx_xmit_pkts(tx_queue, tx_pkts, nb_pkts, always_rs);
 
 	/* transmit more than the max burst, in chunks of TX_MAX_BURST */
 	nb_tx = 0;
 	while (nb_pkts) {
 		uint16_t ret, n;
 		n = (uint16_t)RTE_MIN(nb_pkts, RTE_PMD_IXGBE_TX_MAX_BURST);
-		ret = tx_xmit_pkts(tx_queue, &(tx_pkts[nb_tx]), n);
+		ret = tx_xmit_pkts(tx_queue, &(tx_pkts[nb_tx]), n, always_rs);
 		nb_tx = (uint16_t)(nb_tx + ret);
 		nb_pkts = (uint16_t)(nb_pkts - ret);
 		if (ret < n)
@@ -354,6 +369,20 @@  ixgbe_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
 	return nb_tx;
 }
 
+uint16_t
+ixgbe_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
+		       uint16_t nb_pkts)
+{
+	return _ixgbe_xmit_pkts_simple(tx_queue, tx_pkts, nb_pkts, false);
+}
+
+uint16_t
+ixgbe_xmit_pkts_simple_always_rs(void *tx_queue, struct rte_mbuf **tx_pkts,
+				 uint16_t nb_pkts)
+{
+	return _ixgbe_xmit_pkts_simple(tx_queue, tx_pkts, nb_pkts, true);
+}
+
 static inline void
 ixgbe_set_xmit_ctx(struct ixgbe_tx_queue *txq,
 		volatile struct ixgbe_adv_tx_context_desc *ctx_txd,
@@ -565,9 +594,10 @@  ixgbe_xmit_cleanup(struct ixgbe_tx_queue *txq)
 	return (0);
 }
 
-uint16_t
-ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
-		uint16_t nb_pkts)
+/* if always_rs is TRUE set RS bit on every descriptor with EOP bit */
+static inline uint16_t
+_ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
+		 uint16_t nb_pkts, bool always_rs)
 {
 	struct ixgbe_tx_queue *txq;
 	struct ixgbe_tx_entry *sw_ring;
@@ -827,11 +857,20 @@  ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 		 * The last packet data descriptor needs End Of Packet (EOP)
 		 */
 		cmd_type_len |= IXGBE_TXD_CMD_EOP;
-		txq->nb_tx_used = (uint16_t)(txq->nb_tx_used + nb_used);
+
+		if (always_rs) {
+			PMD_TX_FREE_LOG(DEBUG,
+					"Setting RS bit on TXD id=%4u (port=%d queue=%d)",
+					tx_last, txq->port_id, txq->queue_id);
+			cmd_type_len |= IXGBE_TXD_CMD_RS;
+		} else {
+			txq->nb_tx_used = (uint16_t)(txq->nb_tx_used + nb_used);
+		}
+
 		txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_used);
 
 		/* Set RS bit only on threshold packets' last descriptor */
-		if (txq->nb_tx_used >= txq->tx_rs_thresh) {
+		if (!always_rs && txq->nb_tx_used >= txq->tx_rs_thresh) {
 			PMD_TX_FREE_LOG(DEBUG,
 					"Setting RS bit on TXD id="
 					"%4u (port=%d queue=%d)",
@@ -859,6 +898,20 @@  end_of_tx:
 	return (nb_tx);
 }
 
+uint16_t
+ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts)
+{
+	return _ixgbe_xmit_pkts(tx_queue, tx_pkts, nb_pkts, false);
+}
+
+uint16_t
+ixgbe_xmit_pkts_always_rs(void *tx_queue, struct rte_mbuf **tx_pkts,
+			  uint16_t nb_pkts)
+{
+	return _ixgbe_xmit_pkts(tx_queue, tx_pkts, nb_pkts, true);
+}
+
 /*********************************************************************
  *
  *  RX functions
@@ -2047,6 +2100,22 @@  static const struct ixgbe_txq_ops def_txq_ops = {
 void __attribute__((cold))
 ixgbe_set_tx_function(struct rte_eth_dev *dev, struct ixgbe_tx_queue *txq)
 {
+	struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+
+	/*
+	 * According to 82599 and x540 specifications RS bit *must* be set on
+	 * the last descriptor of *every* packet. Therefore we will not allow
+	 * the tx_rs_thresh above 1 for all NICs newer than 82598. Since VFs are
+	 * available only on devices starting from 82599, tx_rs_thresh should be
+	 * set to 1 for ALL VF devices.
+	 */
+	bool rs_deferring_allowed = (hw->mac.type <= ixgbe_mac_82598EB);
+
+	if (rs_deferring_allowed)
+		PMD_INIT_LOG(DEBUG, "Using an RS bit setting deferring callback");
+	else
+		PMD_INIT_LOG(DEBUG, "Using an always setting RS bit callback");
+
 	/* Use a simple Tx queue (no offloads, no multi segs) if possible */
 	if (((txq->txq_flags & IXGBE_SIMPLE_FLAGS) == IXGBE_SIMPLE_FLAGS)
 			&& (txq->tx_rs_thresh >= RTE_PMD_IXGBE_TX_MAX_BURST)) {
@@ -2056,10 +2125,14 @@  ixgbe_set_tx_function(struct rte_eth_dev *dev, struct ixgbe_tx_queue *txq)
 				(rte_eal_process_type() != RTE_PROC_PRIMARY ||
 					ixgbe_txq_vec_setup(txq) == 0)) {
 			PMD_INIT_LOG(DEBUG, "Vector tx enabled.");
-			dev->tx_pkt_burst = ixgbe_xmit_pkts_vec;
+			dev->tx_pkt_burst = (rs_deferring_allowed ?
+					     ixgbe_xmit_pkts_vec :
+					     ixgbe_xmit_pkts_vec_always_rs);
 		} else
 #endif
-		dev->tx_pkt_burst = ixgbe_xmit_pkts_simple;
+		dev->tx_pkt_burst = (rs_deferring_allowed ?
+				     ixgbe_xmit_pkts_simple :
+				     ixgbe_xmit_pkts_simple_always_rs);
 	} else {
 		PMD_INIT_LOG(DEBUG, "Using full-featured tx code path");
 		PMD_INIT_LOG(DEBUG,
@@ -2070,7 +2143,9 @@  ixgbe_set_tx_function(struct rte_eth_dev *dev, struct ixgbe_tx_queue *txq)
 				" - tx_rs_thresh = %lu " "[RTE_PMD_IXGBE_TX_MAX_BURST=%lu]",
 				(unsigned long)txq->tx_rs_thresh,
 				(unsigned long)RTE_PMD_IXGBE_TX_MAX_BURST);
-		dev->tx_pkt_burst = ixgbe_xmit_pkts;
+		dev->tx_pkt_burst = (rs_deferring_allowed ?
+				     ixgbe_xmit_pkts :
+				     ixgbe_xmit_pkts_always_rs);
 	}
 }
 
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.h b/drivers/net/ixgbe/ixgbe_rxtx.h
index 113682a..c7ed71b 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.h
+++ b/drivers/net/ixgbe/ixgbe_rxtx.h
@@ -285,6 +285,8 @@  void ixgbe_rx_queue_release_mbufs_vec(struct ixgbe_rx_queue *rxq);
 
 uint16_t ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 		uint16_t nb_pkts);
+uint16_t ixgbe_xmit_pkts_vec_always_rs(
+	void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
 int ixgbe_txq_vec_setup(struct ixgbe_tx_queue *txq);
 
 #endif /* RTE_IXGBE_INC_VECTOR */
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec.c b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
index cf25a53..a680fc2 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
@@ -723,9 +723,10 @@  tx_backlog_entry(struct ixgbe_tx_entry_v *txep,
 		txep[i].mbuf = tx_pkts[i];
 }
 
-uint16_t
-ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
-		       uint16_t nb_pkts)
+/* if always_rs is TRUE set RS bit on every descriptor with EOP bit */
+static inline uint16_t
+_ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+		     uint16_t nb_pkts, bool always_rs)
 {
 	struct ixgbe_tx_queue *txq = (struct ixgbe_tx_queue *)tx_queue;
 	volatile union ixgbe_adv_tx_desc *txdp;
@@ -735,6 +736,9 @@  ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64_t rs = IXGBE_ADVTXD_DCMD_RS|DCMD_DTYP_FLAGS;
 	int i;
 
+	if (always_rs)
+		flags = rs;
+
 	if (unlikely(nb_pkts > RTE_IXGBE_VPMD_TX_BURST))
 		nb_pkts = RTE_IXGBE_VPMD_TX_BURST;
 
@@ -764,7 +768,8 @@  ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 		nb_commit = (uint16_t)(nb_commit - n);
 
 		tx_id = 0;
-		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+		if (!always_rs)
+			txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
 
 		/* avoid reach the end of ring */
 		txdp = &(txq->tx_ring[tx_id]);
@@ -776,7 +781,7 @@  ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 	vtx(txdp, tx_pkts, nb_commit, flags);
 
 	tx_id = (uint16_t)(tx_id + nb_commit);
-	if (tx_id > txq->tx_next_rs) {
+	if (!always_rs && tx_id > txq->tx_next_rs) {
 		txq->tx_ring[txq->tx_next_rs].read.cmd_type_len |=
 			rte_cpu_to_le_32(IXGBE_ADVTXD_DCMD_RS);
 		txq->tx_next_rs = (uint16_t)(txq->tx_next_rs +
@@ -790,6 +795,20 @@  ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 	return nb_pkts;
 }
 
+uint16_t
+ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+		    uint16_t nb_pkts)
+{
+	return _ixgbe_xmit_pkts_vec(tx_queue, tx_pkts, nb_pkts, false);
+}
+
+uint16_t
+ixgbe_xmit_pkts_vec_always_rs(void *tx_queue, struct rte_mbuf **tx_pkts,
+			      uint16_t nb_pkts)
+{
+	return _ixgbe_xmit_pkts_vec(tx_queue, tx_pkts, nb_pkts, true);
+}
+
 static void __attribute__((cold))
 ixgbe_tx_queue_release_mbufs_vec(struct ixgbe_tx_queue *txq)
 {