[dpdk-dev,v2,3/5] igb: enable rx queue interrupts for PF

Message ID 1422951511-28143-4-git-send-email-danny.zhou@intel.com (mailing list archive)
State Superseded, archived
Headers

Commit Message

Zhou, Danny Feb. 3, 2015, 8:18 a.m. UTC
v2 changes
- Consolidate review comments related to coding style

The patch does below for igb PF:
- Setup NIC to generate MSI-X interrupts
- Set the IVAR register to map interrupt causes to vectors
- Implement interrupt enable/disable functions

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Tested-by: Yong Liu <yong.liu@intel.com>
---
 lib/librte_pmd_e1000/e1000/e1000_hw.h |   3 +
 lib/librte_pmd_e1000/e1000_ethdev.h   |   6 +
 lib/librte_pmd_e1000/igb_ethdev.c     | 230 ++++++++++++++++++++++++++++++----
 3 files changed, 214 insertions(+), 25 deletions(-)
  

Comments

Cunming Liang Feb. 12, 2015, 10 a.m. UTC | #1
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Zhou Danny
> Sent: Tuesday, February 03, 2015 4:18 PM
> To: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH v2 3/5] igb: enable rx queue interrupts for PF
> 
> v2 changes
> - Consolidate review comments related to coding style
> 
> The patch does below for igb PF:
> - Setup NIC to generate MSI-X interrupts
> - Set the IVAR register to map interrupt causes to vectors
> - Implement interrupt enable/disable functions
> 
> Signed-off-by: Danny Zhou <danny.zhou@intel.com>
> Tested-by: Yong Liu <yong.liu@intel.com>
> ---
>  lib/librte_pmd_e1000/e1000/e1000_hw.h |   3 +
>  lib/librte_pmd_e1000/e1000_ethdev.h   |   6 +
>  lib/librte_pmd_e1000/igb_ethdev.c     | 230
> ++++++++++++++++++++++++++++++----
>  3 files changed, 214 insertions(+), 25 deletions(-)
> 
> diff --git a/lib/librte_pmd_e1000/e1000/e1000_hw.h
> b/lib/librte_pmd_e1000/e1000/e1000_hw.h
> index 4dd92a3..9b999ec 100644
> --- a/lib/librte_pmd_e1000/e1000/e1000_hw.h
> +++ b/lib/librte_pmd_e1000/e1000/e1000_hw.h
> @@ -780,6 +780,9 @@ struct e1000_mac_info {
>  	u16 mta_reg_count;
>  	u16 uta_reg_count;
> 
> +	u32 max_rx_queues;
> +	u32 max_tx_queues;
> +
[LCM]  It can be avoid to define new things in share code.
The max_rx/tx_queues in mac info only used by eth_igb_configure_msix_intr().
It the input param won't be limit to 'hw'. If using rte_eth_dev as input, you can get all the info from dev_info.
The risk in share code is, on next time code merging, it won't care the change here do.

>  	/* Maximum size of the MTA register table in all supported adapters */
>  	#define MAX_MTA_REG 128
>  	u32 mta_shadow[MAX_MTA_REG];
> diff --git a/lib/librte_pmd_e1000/e1000_ethdev.h
> b/lib/librte_pmd_e1000/e1000_ethdev.h
> index d155e77..713ca11 100644
> --- a/lib/librte_pmd_e1000/e1000_ethdev.h
> +++ b/lib/librte_pmd_e1000/e1000_ethdev.h
> @@ -34,6 +34,8 @@
>  #ifndef _E1000_ETHDEV_H_
>  #define _E1000_ETHDEV_H_
> 
> +#include <rte_spinlock.h>
> +
>  /* need update link, bit flag */
>  #define E1000_FLAG_NEED_LINK_UPDATE (uint32_t)(1 << 0)
>  #define E1000_FLAG_MAILBOX          (uint32_t)(1 << 1)
> @@ -105,10 +107,14 @@
>  #define E1000_FTQF_QUEUE_SHIFT           16
>  #define E1000_FTQF_QUEUE_ENABLE          0x00000100
> 
> +/* maximum number of other interrupts besides Rx & Tx interrupts */
> +#define E1000_MAX_OTHER_INTR		1
> +
>  /* structure for interrupt relative data */
>  struct e1000_interrupt {
>  	uint32_t flags;
>  	uint32_t mask;
> +	rte_spinlock_t lock;
>  };
> 
>  /* local vfta copy */
> diff --git a/lib/librte_pmd_e1000/igb_ethdev.c
> b/lib/librte_pmd_e1000/igb_ethdev.c
> index 2a268b8..7d9b103 100644
> --- a/lib/librte_pmd_e1000/igb_ethdev.c
> +++ b/lib/librte_pmd_e1000/igb_ethdev.c
> @@ -97,6 +97,7 @@ static int  eth_igb_flow_ctrl_get(struct rte_eth_dev *dev,
>  static int  eth_igb_flow_ctrl_set(struct rte_eth_dev *dev,
>  				struct rte_eth_fc_conf *fc_conf);
>  static int eth_igb_lsc_interrupt_setup(struct rte_eth_dev *dev);
> +static int eth_igb_rxq_interrupt_setup(struct rte_eth_dev *dev);
>  static int eth_igb_interrupt_get_status(struct rte_eth_dev *dev);
>  static int eth_igb_interrupt_action(struct rte_eth_dev *dev);
>  static void eth_igb_interrupt_handler(struct rte_intr_handle *handle,
> @@ -191,6 +192,14 @@ static int eth_igb_filter_ctrl(struct rte_eth_dev *dev,
>  		     enum rte_filter_op filter_op,
>  		     void *arg);
> 
> +static int eth_igb_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t
> queue_id);
> +static int eth_igb_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t
> queue_id);
> +static void eth_igb_assign_msix_vector(struct e1000_hw *hw, int8_t direction,
> +				uint8_t queue, uint8_t msix_vector);
> +static void eth_igb_configure_msix_intr(struct  e1000_hw *hw);
> +static void eth_igb_write_ivar(struct e1000_hw *hw, uint8_t msix_vector,
> +				uint8_t index, uint8_t offset);
> +
>  /*
>   * Define VF Stats MACRO for Non "cleared on read" register
>   */
> @@ -250,6 +259,8 @@ static struct eth_dev_ops eth_igb_ops = {
>  	.vlan_tpid_set        = eth_igb_vlan_tpid_set,
>  	.vlan_offload_set     = eth_igb_vlan_offload_set,
>  	.rx_queue_setup       = eth_igb_rx_queue_setup,
> +	.rx_queue_intr_enable = eth_igb_rx_queue_intr_enable,
> +	.rx_queue_intr_disable = eth_igb_rx_queue_intr_disable,
>  	.rx_queue_release     = eth_igb_rx_queue_release,
>  	.rx_queue_count       = eth_igb_rx_queue_count,
>  	.rx_descriptor_done   = eth_igb_rx_descriptor_done,
> @@ -592,6 +603,16 @@ eth_igb_dev_init(__attribute__((unused)) struct
> eth_driver *eth_drv,
>  		     eth_dev->data->port_id, pci_dev->id.vendor_id,
>  		     pci_dev->id.device_id);
> 
> +	/* set max interrupt vfio request */
> +	struct rte_eth_dev_info dev_info;
> +
> +	memset(&dev_info, 0, sizeof(dev_info));
> +	eth_igb_infos_get(eth_dev, &dev_info);
> +
> +	hw->mac.max_rx_queues = dev_info.max_rx_queues;
> +
> +	pci_dev->intr_handle.max_intr = hw->mac.max_rx_queues +
> E1000_MAX_OTHER_INTR;
> +
>  	rte_intr_callback_register(&(pci_dev->intr_handle),
>  		eth_igb_interrupt_handler, (void *)eth_dev);
> 
> @@ -754,7 +775,7 @@ eth_igb_start(struct rte_eth_dev *dev)
>  {
>  	struct e1000_hw *hw =
>  		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> -	int ret, i, mask;
> +	int ret, mask;
>  	uint32_t ctrl_ext;
> 
>  	PMD_INIT_FUNC_TRACE();
> @@ -794,6 +815,9 @@ eth_igb_start(struct rte_eth_dev *dev)
>  	/* configure PF module if SRIOV enabled */
>  	igb_pf_host_configure(dev);
> 
> +	/* confiugre msix for rx interrupt */
> +	eth_igb_configure_msix_intr(hw);
> +
>  	/* Configure for OS presence */
>  	igb_init_manageability(hw);
> 
> @@ -821,33 +845,9 @@ eth_igb_start(struct rte_eth_dev *dev)
>  		igb_vmdq_vlan_hw_filter_enable(dev);
>  	}
> 
> -	/*
> -	 * Configure the Interrupt Moderation register (EITR) with the maximum
> -	 * possible value (0xFFFF) to minimize "System Partial Write" issued by
> -	 * spurious [DMA] memory updates of RX and TX ring descriptors.
> -	 *
> -	 * With a EITR granularity of 2 microseconds in the 82576, only 7/8
> -	 * spurious memory updates per second should be expected.
> -	 * ((65535 * 2) / 1000.1000 ~= 0.131 second).
> -	 *
> -	 * Because interrupts are not used at all, the MSI-X is not activated
> -	 * and interrupt moderation is controlled by EITR[0].
> -	 *
> -	 * Note that having [almost] disabled memory updates of RX and TX ring
> -	 * descriptors through the Interrupt Moderation mechanism, memory
> -	 * updates of ring descriptors are now moderated by the configurable
> -	 * value of Write-Back Threshold registers.
> -	 */
>  	if ((hw->mac.type == e1000_82576) || (hw->mac.type == e1000_82580)
> ||
>  		(hw->mac.type == e1000_i350) || (hw->mac.type == e1000_i210)
> ||
>  		(hw->mac.type == e1000_i211)) {
> -		uint32_t ivar;
> -
> -		/* Enable all RX & TX queues in the IVAR registers */
> -		ivar = (uint32_t) ((E1000_IVAR_VALID << 16) |
> E1000_IVAR_VALID);
> -		for (i = 0; i < 8; i++)
> -			E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, i, ivar);
> -
>  		/* Configure EITR with the maximum possible value (0xFFFF) */
>  		E1000_WRITE_REG(hw, E1000_EITR(0), 0xFFFF);
>  	}
> @@ -901,6 +901,10 @@ eth_igb_start(struct rte_eth_dev *dev)
>  	if (dev->data->dev_conf.intr_conf.lsc != 0)
>  		ret = eth_igb_lsc_interrupt_setup(dev);
> 
> +	/* check if rxq interrupt is enabled */
> +	if (dev->data->dev_conf.intr_conf.rxq != 0)
> +		eth_igb_rxq_interrupt_setup(dev);
> +
>  	/* resume enabled intr since hw reset */
>  	igb_intr_enable(dev);
> 
> @@ -1791,6 +1795,35 @@ eth_igb_lsc_interrupt_setup(struct rte_eth_dev *dev)
>  		E1000_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
> 
>  	intr->mask |= E1000_ICR_LSC;
> +	rte_spinlock_init(&(intr->lock));
> +
> +	return 0;
> +}
> +
> +/*
> + * It clears the interrupt causes and enables the interrupt.
> + * It will be called once only during nic initialized.
> + *
> + * @param dev
> + *  Pointer to struct rte_eth_dev.
> + *
> + * @return
> + *  - On success, zero.
> + *  - On failure, a negative value.
> + */
> +static int eth_igb_rxq_interrupt_setup(struct rte_eth_dev *dev)
> +{
> +	uint32_t mask, regval;
> +	struct e1000_hw *hw =
> +		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> +	struct rte_eth_dev_info dev_info;
> +
> +	memset(&dev_info, 0, sizeof(dev_info));
> +	eth_igb_infos_get(dev, &dev_info);
> +
> +	mask = 0xFFFFFFFF >> (32 - dev_info.max_rx_queues);
> +	regval = E1000_READ_REG(hw, E1000_EIMS);
> +	E1000_WRITE_REG(hw, E1000_EIMS, regval | mask);
> 
>  	return 0;
>  }
> @@ -3256,5 +3289,152 @@ static struct rte_driver pmd_igbvf_drv = {
>  	.init = rte_igbvf_pmd_init,
>  };
> 
> +static int
> +eth_igb_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
> +{
> +	struct e1000_hw *hw =
> +		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> +	struct e1000_interrupt *intr =
> +		E1000_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
> +	uint32_t mask = 1 << queue_id;
> +
> +	rte_spinlock_lock(&(intr->lock));
> +	E1000_WRITE_REG(hw, E1000_EIMC, mask);
> +	E1000_WRITE_FLUSH(hw);
> +	rte_spinlock_unlock(&(intr->lock));
> +
> +	return 0;
> +}
> +
> +static int
> +eth_igb_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
> +{
> +	struct e1000_hw *hw =
> +		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> +	struct e1000_interrupt *intr =
> +		E1000_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
> +	uint32_t mask = 1 << queue_id;
> +	uint32_t regval;
> +
> +	rte_spinlock_lock(&(intr->lock));
> +	regval = E1000_READ_REG(hw, E1000_EIMS);
> +	E1000_WRITE_REG(hw, E1000_EIMS, regval | mask);
> +	E1000_WRITE_FLUSH(hw);
> +	rte_spinlock_unlock(&(intr->lock));
> +
> +	return 0;
> +}
> +
> +static void
> +eth_igb_write_ivar(struct e1000_hw *hw, uint8_t  msix_vector,
> +			uint8_t index, uint8_t offset)
> +{
> +	uint32_t val = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index);
> +
> +	/* clear bits */
> +	val &= ~((uint32_t)0xFF << offset);
> +
> +	/* write vector and valid bit */
> +	val |= (msix_vector | E1000_IVAR_VALID) << offset;
> +
> +	E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, val);
> +}
> +
> +static void
> +eth_igb_assign_msix_vector(struct e1000_hw *hw, int8_t direction,
> +				 uint8_t queue, uint8_t msix_vector)
> +{
> +	uint32_t tmp = 0;
> +	if (hw->mac.type == e1000_82575) {
> +		if (direction == 0)
> +			tmp = E1000_EICR_RX_QUEUE0 << queue;
> +		else if (direction == 1)
> +			tmp = E1000_EICR_TX_QUEUE0 << queue;
> +		E1000_WRITE_REG(hw, E1000_MSIXBM(msix_vector), tmp);
> +	} else if (hw->mac.type == e1000_82576) {
> +		if ((direction == 0) || (direction == 1))
> +			eth_igb_write_ivar(hw, msix_vector, queue & 0x7,
> +					((queue & 0x8) << 1) + 8 * direction);
> +	} else if ((hw->mac.type == e1000_82580) ||
> +			(hw->mac.type == e1000_i350) ||
> +			(hw->mac.type == e1000_i354) ||
> +			(hw->mac.type == e1000_i210) ||
> +			(hw->mac.type == e1000_i211)) {
> +		if ((direction == 0) || (direction == 1))
> +			eth_igb_write_ivar(hw, msix_vector,
> +					queue >> 1,
> +					((queue & 0x1) << 4) + 8 * direction);
> +	}
> +}
> +
> +/*
> + * Sets up the hardware to generate MSI-X interrupts properly
> + * @hw
> + *  board private structure
> + */
> +static void
> +eth_igb_configure_msix_intr(struct e1000_hw *hw)
> +{
> +	int queue_id;
> +	uint32_t tmpval, regval, intr_mask;
> +	uint32_t max_rx_queues = hw->mac.max_rx_queues;
> +
> +	/* set interrupt vector for other causes */
> +	if (hw->mac.type == e1000_82575) {
> +		tmpval = E1000_READ_REG(hw, E1000_CTRL_EXT);
> +		/* enable MSI-X PBA support */
> +		tmpval |= E1000_CTRL_EXT_PBA_CLR;
> +
> +		/* Auto-Mask interrupts upon ICR read */
> +		tmpval |= E1000_CTRL_EXT_EIAME;
> +		tmpval |= E1000_CTRL_EXT_IRCA;
> +
> +		E1000_WRITE_REG(hw, E1000_CTRL_EXT, tmpval);
> +
> +		/* enable msix_other interrupt */
> +		E1000_WRITE_REG_ARRAY(hw, E1000_MSIXBM(0), 0,
> E1000_EIMS_OTHER);
> +		regval = E1000_READ_REG(hw, E1000_EIAC);
> +		E1000_WRITE_REG(hw, E1000_EIAC, regval |
> E1000_EIMS_OTHER);
> +		regval = E1000_READ_REG(hw, E1000_EIAM);
> +		E1000_WRITE_REG(hw, E1000_EIMS, regval |
> E1000_EIMS_OTHER);
> +	} else if ((hw->mac.type == e1000_82576) ||
> +			(hw->mac.type == e1000_82580) ||
> +			(hw->mac.type == e1000_i350) ||
> +			(hw->mac.type == e1000_i354) ||
> +			(hw->mac.type == e1000_i210) ||
> +			(hw->mac.type == e1000_i211)) {
> +		/* turn on MSI-X capability first */
> +		E1000_WRITE_REG(hw, E1000_GPIE, E1000_GPIE_MSIX_MODE |
> +							E1000_GPIE_PBA |
> E1000_GPIE_EIAME |
> +							E1000_GPIE_NSICR);
> +
> +		/* enable msix_other interrupt */
> +		intr_mask = 1 << max_rx_queues;
> +		regval = E1000_READ_REG(hw, E1000_EIAC);
> +		E1000_WRITE_REG(hw, E1000_EIAC, regval | intr_mask);
> +		regval = E1000_READ_REG(hw, E1000_EIMS);
> +		E1000_WRITE_REG(hw, E1000_EIMS, regval | intr_mask);
> +		tmpval = (max_rx_queues | E1000_IVAR_VALID) << 8;
> +
> +		E1000_WRITE_REG(hw, E1000_IVAR_MISC, tmpval);
> +	}
> +
> +	/*
> +	* use EIAM and EIAC to auto-mask and auto-clear when MSI-X interrupt
> is asserted
> +	* this saves a register write for every interrupt
> +	*/
> +	intr_mask = 0xFFFFFFFF >> (32 - max_rx_queues);
> +	regval = E1000_READ_REG(hw, E1000_EIAC);
> +	E1000_WRITE_REG(hw, E1000_EIAC, regval | intr_mask);
> +	regval = E1000_READ_REG(hw, E1000_EIAM);
> +	E1000_WRITE_REG(hw, E1000_EIAM, regval | intr_mask);
> +
> +	for (queue_id = 0; queue_id < VFIO_MAX_QUEUE_ID; queue_id++)
> +		eth_igb_assign_msix_vector(hw, 0, queue_id, queue_id);
> +
> +	E1000_WRITE_FLUSH(hw);
> +}
> +
> +
>  PMD_REGISTER_DRIVER(pmd_igb_drv);
>  PMD_REGISTER_DRIVER(pmd_igbvf_drv);
> --
> 1.8.1.4
  
Ananyev, Konstantin Feb. 12, 2015, 10:29 a.m. UTC | #2
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Liang, Cunming
> Sent: Thursday, February 12, 2015 10:01 AM
> To: Zhou, Danny; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2 3/5] igb: enable rx queue interrupts for PF
> 
> 
> 
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Zhou Danny
> > Sent: Tuesday, February 03, 2015 4:18 PM
> > To: dev@dpdk.org
> > Subject: [dpdk-dev] [PATCH v2 3/5] igb: enable rx queue interrupts for PF
> >
> > v2 changes
> > - Consolidate review comments related to coding style
> >
> > The patch does below for igb PF:
> > - Setup NIC to generate MSI-X interrupts
> > - Set the IVAR register to map interrupt causes to vectors
> > - Implement interrupt enable/disable functions
> >
> > Signed-off-by: Danny Zhou <danny.zhou@intel.com>
> > Tested-by: Yong Liu <yong.liu@intel.com>
> > ---
> >  lib/librte_pmd_e1000/e1000/e1000_hw.h |   3 +
> >  lib/librte_pmd_e1000/e1000_ethdev.h   |   6 +
> >  lib/librte_pmd_e1000/igb_ethdev.c     | 230
> > ++++++++++++++++++++++++++++++----
> >  3 files changed, 214 insertions(+), 25 deletions(-)
> >
> > diff --git a/lib/librte_pmd_e1000/e1000/e1000_hw.h
> > b/lib/librte_pmd_e1000/e1000/e1000_hw.h
> > index 4dd92a3..9b999ec 100644
> > --- a/lib/librte_pmd_e1000/e1000/e1000_hw.h
> > +++ b/lib/librte_pmd_e1000/e1000/e1000_hw.h
> > @@ -780,6 +780,9 @@ struct e1000_mac_info {
> >  	u16 mta_reg_count;
> >  	u16 uta_reg_count;
> >
> > +	u32 max_rx_queues;
> > +	u32 max_tx_queues;
> > +
> [LCM]  It can be avoid to define new things in share code.
> The max_rx/tx_queues in mac info only used by eth_igb_configure_msix_intr().
> It the input param won't be limit to 'hw'. If using rte_eth_dev as input, you can get all the info from dev_info.
> The risk in share code is, on next time code merging, it won't care the change here do.

+1

BTW, another question, do we really need a spin_lock here (and in ixgbe as I remember)?
Usually we trying to avoid locks inside PMD code, just marking API call as 'not thread safe'.
Konstantin

> 
> >  	/* Maximum size of the MTA register table in all supported adapters */
> >  	#define MAX_MTA_REG 128
> >  	u32 mta_shadow[MAX_MTA_REG];
> > diff --git a/lib/librte_pmd_e1000/e1000_ethdev.h
> > b/lib/librte_pmd_e1000/e1000_ethdev.h
> > index d155e77..713ca11 100644
> > --- a/lib/librte_pmd_e1000/e1000_ethdev.h
> > +++ b/lib/librte_pmd_e1000/e1000_ethdev.h
> > @@ -34,6 +34,8 @@
> >  #ifndef _E1000_ETHDEV_H_
> >  #define _E1000_ETHDEV_H_
> >
> > +#include <rte_spinlock.h>
> > +
> >  /* need update link, bit flag */
> >  #define E1000_FLAG_NEED_LINK_UPDATE (uint32_t)(1 << 0)
> >  #define E1000_FLAG_MAILBOX          (uint32_t)(1 << 1)
> > @@ -105,10 +107,14 @@
> >  #define E1000_FTQF_QUEUE_SHIFT           16
> >  #define E1000_FTQF_QUEUE_ENABLE          0x00000100
> >
> > +/* maximum number of other interrupts besides Rx & Tx interrupts */
> > +#define E1000_MAX_OTHER_INTR		1
> > +
> >  /* structure for interrupt relative data */
> >  struct e1000_interrupt {
> >  	uint32_t flags;
> >  	uint32_t mask;
> > +	rte_spinlock_t lock;
> >  };
> >
> >  /* local vfta copy */
> > diff --git a/lib/librte_pmd_e1000/igb_ethdev.c
> > b/lib/librte_pmd_e1000/igb_ethdev.c
> > index 2a268b8..7d9b103 100644
> > --- a/lib/librte_pmd_e1000/igb_ethdev.c
> > +++ b/lib/librte_pmd_e1000/igb_ethdev.c
> > @@ -97,6 +97,7 @@ static int  eth_igb_flow_ctrl_get(struct rte_eth_dev *dev,
> >  static int  eth_igb_flow_ctrl_set(struct rte_eth_dev *dev,
> >  				struct rte_eth_fc_conf *fc_conf);
> >  static int eth_igb_lsc_interrupt_setup(struct rte_eth_dev *dev);
> > +static int eth_igb_rxq_interrupt_setup(struct rte_eth_dev *dev);
> >  static int eth_igb_interrupt_get_status(struct rte_eth_dev *dev);
> >  static int eth_igb_interrupt_action(struct rte_eth_dev *dev);
> >  static void eth_igb_interrupt_handler(struct rte_intr_handle *handle,
> > @@ -191,6 +192,14 @@ static int eth_igb_filter_ctrl(struct rte_eth_dev *dev,
> >  		     enum rte_filter_op filter_op,
> >  		     void *arg);
> >
> > +static int eth_igb_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t
> > queue_id);
> > +static int eth_igb_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t
> > queue_id);
> > +static void eth_igb_assign_msix_vector(struct e1000_hw *hw, int8_t direction,
> > +				uint8_t queue, uint8_t msix_vector);
> > +static void eth_igb_configure_msix_intr(struct  e1000_hw *hw);
> > +static void eth_igb_write_ivar(struct e1000_hw *hw, uint8_t msix_vector,
> > +				uint8_t index, uint8_t offset);
> > +
> >  /*
> >   * Define VF Stats MACRO for Non "cleared on read" register
> >   */
> > @@ -250,6 +259,8 @@ static struct eth_dev_ops eth_igb_ops = {
> >  	.vlan_tpid_set        = eth_igb_vlan_tpid_set,
> >  	.vlan_offload_set     = eth_igb_vlan_offload_set,
> >  	.rx_queue_setup       = eth_igb_rx_queue_setup,
> > +	.rx_queue_intr_enable = eth_igb_rx_queue_intr_enable,
> > +	.rx_queue_intr_disable = eth_igb_rx_queue_intr_disable,
> >  	.rx_queue_release     = eth_igb_rx_queue_release,
> >  	.rx_queue_count       = eth_igb_rx_queue_count,
> >  	.rx_descriptor_done   = eth_igb_rx_descriptor_done,
> > @@ -592,6 +603,16 @@ eth_igb_dev_init(__attribute__((unused)) struct
> > eth_driver *eth_drv,
> >  		     eth_dev->data->port_id, pci_dev->id.vendor_id,
> >  		     pci_dev->id.device_id);
> >
> > +	/* set max interrupt vfio request */
> > +	struct rte_eth_dev_info dev_info;
> > +
> > +	memset(&dev_info, 0, sizeof(dev_info));
> > +	eth_igb_infos_get(eth_dev, &dev_info);
> > +
> > +	hw->mac.max_rx_queues = dev_info.max_rx_queues;
> > +
> > +	pci_dev->intr_handle.max_intr = hw->mac.max_rx_queues +
> > E1000_MAX_OTHER_INTR;
> > +
> >  	rte_intr_callback_register(&(pci_dev->intr_handle),
> >  		eth_igb_interrupt_handler, (void *)eth_dev);
> >
> > @@ -754,7 +775,7 @@ eth_igb_start(struct rte_eth_dev *dev)
> >  {
> >  	struct e1000_hw *hw =
> >  		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> > -	int ret, i, mask;
> > +	int ret, mask;
> >  	uint32_t ctrl_ext;
> >
> >  	PMD_INIT_FUNC_TRACE();
> > @@ -794,6 +815,9 @@ eth_igb_start(struct rte_eth_dev *dev)
> >  	/* configure PF module if SRIOV enabled */
> >  	igb_pf_host_configure(dev);
> >
> > +	/* confiugre msix for rx interrupt */
> > +	eth_igb_configure_msix_intr(hw);
> > +
> >  	/* Configure for OS presence */
> >  	igb_init_manageability(hw);
> >
> > @@ -821,33 +845,9 @@ eth_igb_start(struct rte_eth_dev *dev)
> >  		igb_vmdq_vlan_hw_filter_enable(dev);
> >  	}
> >
> > -	/*
> > -	 * Configure the Interrupt Moderation register (EITR) with the maximum
> > -	 * possible value (0xFFFF) to minimize "System Partial Write" issued by
> > -	 * spurious [DMA] memory updates of RX and TX ring descriptors.
> > -	 *
> > -	 * With a EITR granularity of 2 microseconds in the 82576, only 7/8
> > -	 * spurious memory updates per second should be expected.
> > -	 * ((65535 * 2) / 1000.1000 ~= 0.131 second).
> > -	 *
> > -	 * Because interrupts are not used at all, the MSI-X is not activated
> > -	 * and interrupt moderation is controlled by EITR[0].
> > -	 *
> > -	 * Note that having [almost] disabled memory updates of RX and TX ring
> > -	 * descriptors through the Interrupt Moderation mechanism, memory
> > -	 * updates of ring descriptors are now moderated by the configurable
> > -	 * value of Write-Back Threshold registers.
> > -	 */
> >  	if ((hw->mac.type == e1000_82576) || (hw->mac.type == e1000_82580)
> > ||
> >  		(hw->mac.type == e1000_i350) || (hw->mac.type == e1000_i210)
> > ||
> >  		(hw->mac.type == e1000_i211)) {
> > -		uint32_t ivar;
> > -
> > -		/* Enable all RX & TX queues in the IVAR registers */
> > -		ivar = (uint32_t) ((E1000_IVAR_VALID << 16) |
> > E1000_IVAR_VALID);
> > -		for (i = 0; i < 8; i++)
> > -			E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, i, ivar);
> > -
> >  		/* Configure EITR with the maximum possible value (0xFFFF) */
> >  		E1000_WRITE_REG(hw, E1000_EITR(0), 0xFFFF);
> >  	}
> > @@ -901,6 +901,10 @@ eth_igb_start(struct rte_eth_dev *dev)
> >  	if (dev->data->dev_conf.intr_conf.lsc != 0)
> >  		ret = eth_igb_lsc_interrupt_setup(dev);
> >
> > +	/* check if rxq interrupt is enabled */
> > +	if (dev->data->dev_conf.intr_conf.rxq != 0)
> > +		eth_igb_rxq_interrupt_setup(dev);
> > +
> >  	/* resume enabled intr since hw reset */
> >  	igb_intr_enable(dev);
> >
> > @@ -1791,6 +1795,35 @@ eth_igb_lsc_interrupt_setup(struct rte_eth_dev *dev)
> >  		E1000_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
> >
> >  	intr->mask |= E1000_ICR_LSC;
> > +	rte_spinlock_init(&(intr->lock));
> > +
> > +	return 0;
> > +}
> > +
> > +/*
> > + * It clears the interrupt causes and enables the interrupt.
> > + * It will be called once only during nic initialized.
> > + *
> > + * @param dev
> > + *  Pointer to struct rte_eth_dev.
> > + *
> > + * @return
> > + *  - On success, zero.
> > + *  - On failure, a negative value.
> > + */
> > +static int eth_igb_rxq_interrupt_setup(struct rte_eth_dev *dev)
> > +{
> > +	uint32_t mask, regval;
> > +	struct e1000_hw *hw =
> > +		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> > +	struct rte_eth_dev_info dev_info;
> > +
> > +	memset(&dev_info, 0, sizeof(dev_info));
> > +	eth_igb_infos_get(dev, &dev_info);
> > +
> > +	mask = 0xFFFFFFFF >> (32 - dev_info.max_rx_queues);
> > +	regval = E1000_READ_REG(hw, E1000_EIMS);
> > +	E1000_WRITE_REG(hw, E1000_EIMS, regval | mask);
> >
> >  	return 0;
> >  }
> > @@ -3256,5 +3289,152 @@ static struct rte_driver pmd_igbvf_drv = {
> >  	.init = rte_igbvf_pmd_init,
> >  };
> >
> > +static int
> > +eth_igb_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
> > +{
> > +	struct e1000_hw *hw =
> > +		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> > +	struct e1000_interrupt *intr =
> > +		E1000_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
> > +	uint32_t mask = 1 << queue_id;
> > +
> > +	rte_spinlock_lock(&(intr->lock));
> > +	E1000_WRITE_REG(hw, E1000_EIMC, mask);
> > +	E1000_WRITE_FLUSH(hw);
> > +	rte_spinlock_unlock(&(intr->lock));
> > +
> > +	return 0;
> > +}
> > +
> > +static int
> > +eth_igb_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
> > +{
> > +	struct e1000_hw *hw =
> > +		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> > +	struct e1000_interrupt *intr =
> > +		E1000_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
> > +	uint32_t mask = 1 << queue_id;
> > +	uint32_t regval;
> > +
> > +	rte_spinlock_lock(&(intr->lock));
> > +	regval = E1000_READ_REG(hw, E1000_EIMS);
> > +	E1000_WRITE_REG(hw, E1000_EIMS, regval | mask);
> > +	E1000_WRITE_FLUSH(hw);
> > +	rte_spinlock_unlock(&(intr->lock));
> > +
> > +	return 0;
> > +}
> > +
> > +static void
> > +eth_igb_write_ivar(struct e1000_hw *hw, uint8_t  msix_vector,
> > +			uint8_t index, uint8_t offset)
> > +{
> > +	uint32_t val = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index);
> > +
> > +	/* clear bits */
> > +	val &= ~((uint32_t)0xFF << offset);
> > +
> > +	/* write vector and valid bit */
> > +	val |= (msix_vector | E1000_IVAR_VALID) << offset;
> > +
> > +	E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, val);
> > +}
> > +
> > +static void
> > +eth_igb_assign_msix_vector(struct e1000_hw *hw, int8_t direction,
> > +				 uint8_t queue, uint8_t msix_vector)
> > +{
> > +	uint32_t tmp = 0;
> > +	if (hw->mac.type == e1000_82575) {
> > +		if (direction == 0)
> > +			tmp = E1000_EICR_RX_QUEUE0 << queue;
> > +		else if (direction == 1)
> > +			tmp = E1000_EICR_TX_QUEUE0 << queue;
> > +		E1000_WRITE_REG(hw, E1000_MSIXBM(msix_vector), tmp);
> > +	} else if (hw->mac.type == e1000_82576) {
> > +		if ((direction == 0) || (direction == 1))
> > +			eth_igb_write_ivar(hw, msix_vector, queue & 0x7,
> > +					((queue & 0x8) << 1) + 8 * direction);
> > +	} else if ((hw->mac.type == e1000_82580) ||
> > +			(hw->mac.type == e1000_i350) ||
> > +			(hw->mac.type == e1000_i354) ||
> > +			(hw->mac.type == e1000_i210) ||
> > +			(hw->mac.type == e1000_i211)) {
> > +		if ((direction == 0) || (direction == 1))
> > +			eth_igb_write_ivar(hw, msix_vector,
> > +					queue >> 1,
> > +					((queue & 0x1) << 4) + 8 * direction);
> > +	}
> > +}
> > +
> > +/*
> > + * Sets up the hardware to generate MSI-X interrupts properly
> > + * @hw
> > + *  board private structure
> > + */
> > +static void
> > +eth_igb_configure_msix_intr(struct e1000_hw *hw)
> > +{
> > +	int queue_id;
> > +	uint32_t tmpval, regval, intr_mask;
> > +	uint32_t max_rx_queues = hw->mac.max_rx_queues;
> > +
> > +	/* set interrupt vector for other causes */
> > +	if (hw->mac.type == e1000_82575) {
> > +		tmpval = E1000_READ_REG(hw, E1000_CTRL_EXT);
> > +		/* enable MSI-X PBA support */
> > +		tmpval |= E1000_CTRL_EXT_PBA_CLR;
> > +
> > +		/* Auto-Mask interrupts upon ICR read */
> > +		tmpval |= E1000_CTRL_EXT_EIAME;
> > +		tmpval |= E1000_CTRL_EXT_IRCA;
> > +
> > +		E1000_WRITE_REG(hw, E1000_CTRL_EXT, tmpval);
> > +
> > +		/* enable msix_other interrupt */
> > +		E1000_WRITE_REG_ARRAY(hw, E1000_MSIXBM(0), 0,
> > E1000_EIMS_OTHER);
> > +		regval = E1000_READ_REG(hw, E1000_EIAC);
> > +		E1000_WRITE_REG(hw, E1000_EIAC, regval |
> > E1000_EIMS_OTHER);
> > +		regval = E1000_READ_REG(hw, E1000_EIAM);
> > +		E1000_WRITE_REG(hw, E1000_EIMS, regval |
> > E1000_EIMS_OTHER);
> > +	} else if ((hw->mac.type == e1000_82576) ||
> > +			(hw->mac.type == e1000_82580) ||
> > +			(hw->mac.type == e1000_i350) ||
> > +			(hw->mac.type == e1000_i354) ||
> > +			(hw->mac.type == e1000_i210) ||
> > +			(hw->mac.type == e1000_i211)) {
> > +		/* turn on MSI-X capability first */
> > +		E1000_WRITE_REG(hw, E1000_GPIE, E1000_GPIE_MSIX_MODE |
> > +							E1000_GPIE_PBA |
> > E1000_GPIE_EIAME |
> > +							E1000_GPIE_NSICR);
> > +
> > +		/* enable msix_other interrupt */
> > +		intr_mask = 1 << max_rx_queues;
> > +		regval = E1000_READ_REG(hw, E1000_EIAC);
> > +		E1000_WRITE_REG(hw, E1000_EIAC, regval | intr_mask);
> > +		regval = E1000_READ_REG(hw, E1000_EIMS);
> > +		E1000_WRITE_REG(hw, E1000_EIMS, regval | intr_mask);
> > +		tmpval = (max_rx_queues | E1000_IVAR_VALID) << 8;
> > +
> > +		E1000_WRITE_REG(hw, E1000_IVAR_MISC, tmpval);
> > +	}
> > +
> > +	/*
> > +	* use EIAM and EIAC to auto-mask and auto-clear when MSI-X interrupt
> > is asserted
> > +	* this saves a register write for every interrupt
> > +	*/
> > +	intr_mask = 0xFFFFFFFF >> (32 - max_rx_queues);
> > +	regval = E1000_READ_REG(hw, E1000_EIAC);
> > +	E1000_WRITE_REG(hw, E1000_EIAC, regval | intr_mask);
> > +	regval = E1000_READ_REG(hw, E1000_EIAM);
> > +	E1000_WRITE_REG(hw, E1000_EIAM, regval | intr_mask);
> > +
> > +	for (queue_id = 0; queue_id < VFIO_MAX_QUEUE_ID; queue_id++)
> > +		eth_igb_assign_msix_vector(hw, 0, queue_id, queue_id);
> > +
> > +	E1000_WRITE_FLUSH(hw);
> > +}
> > +
> > +
> >  PMD_REGISTER_DRIVER(pmd_igb_drv);
> >  PMD_REGISTER_DRIVER(pmd_igbvf_drv);
> > --
> > 1.8.1.4
  
Zhou, Danny Feb. 13, 2015, 2:48 a.m. UTC | #3
> -----Original Message-----
> From: Ananyev, Konstantin
> Sent: Thursday, February 12, 2015 6:29 PM
> To: Liang, Cunming; Zhou, Danny; dev@dpdk.org
> Subject: RE: [dpdk-dev] [PATCH v2 3/5] igb: enable rx queue interrupts for PF
> 
> 
> 
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Liang, Cunming
> > Sent: Thursday, February 12, 2015 10:01 AM
> > To: Zhou, Danny; dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH v2 3/5] igb: enable rx queue interrupts for PF
> >
> >
> >
> > > -----Original Message-----
> > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Zhou Danny
> > > Sent: Tuesday, February 03, 2015 4:18 PM
> > > To: dev@dpdk.org
> > > Subject: [dpdk-dev] [PATCH v2 3/5] igb: enable rx queue interrupts for PF
> > >
> > > v2 changes
> > > - Consolidate review comments related to coding style
> > >
> > > The patch does below for igb PF:
> > > - Setup NIC to generate MSI-X interrupts
> > > - Set the IVAR register to map interrupt causes to vectors
> > > - Implement interrupt enable/disable functions
> > >
> > > Signed-off-by: Danny Zhou <danny.zhou@intel.com>
> > > Tested-by: Yong Liu <yong.liu@intel.com>
> > > ---
> > >  lib/librte_pmd_e1000/e1000/e1000_hw.h |   3 +
> > >  lib/librte_pmd_e1000/e1000_ethdev.h   |   6 +
> > >  lib/librte_pmd_e1000/igb_ethdev.c     | 230
> > > ++++++++++++++++++++++++++++++----
> > >  3 files changed, 214 insertions(+), 25 deletions(-)
> > >
> > > diff --git a/lib/librte_pmd_e1000/e1000/e1000_hw.h
> > > b/lib/librte_pmd_e1000/e1000/e1000_hw.h
> > > index 4dd92a3..9b999ec 100644
> > > --- a/lib/librte_pmd_e1000/e1000/e1000_hw.h
> > > +++ b/lib/librte_pmd_e1000/e1000/e1000_hw.h
> > > @@ -780,6 +780,9 @@ struct e1000_mac_info {
> > >  	u16 mta_reg_count;
> > >  	u16 uta_reg_count;
> > >
> > > +	u32 max_rx_queues;
> > > +	u32 max_tx_queues;
> > > +
> > [LCM]  It can be avoid to define new things in share code.
> > The max_rx/tx_queues in mac info only used by eth_igb_configure_msix_intr().
> > It the input param won't be limit to 'hw'. If using rte_eth_dev as input, you can get all the info from dev_info.
> > The risk in share code is, on next time code merging, it won't care the change here do.
> 

Will move those two parameters to dev_info struct.

> +1

> 
> BTW, another question, do we really need a spin_lock here (and in ixgbe as I remember)?
> Usually we trying to avoid locks inside PMD code, just marking API call as 'not thread safe'.
> Konstantin
>

Will replace spin_lock with rte_atomic32_cmpset(), and return failure if interrupt enable/disable
does not take effect. Then it is decision of caller(e.g. l3fwd_power) in application rather
than PMD to spin or not depending on if it really needs to enable/disable interrupt before doing
following stuff.

> >
> > >  	/* Maximum size of the MTA register table in all supported adapters */
> > >  	#define MAX_MTA_REG 128
> > >  	u32 mta_shadow[MAX_MTA_REG];
> > > diff --git a/lib/librte_pmd_e1000/e1000_ethdev.h
> > > b/lib/librte_pmd_e1000/e1000_ethdev.h
> > > index d155e77..713ca11 100644
> > > --- a/lib/librte_pmd_e1000/e1000_ethdev.h
> > > +++ b/lib/librte_pmd_e1000/e1000_ethdev.h
> > > @@ -34,6 +34,8 @@
> > >  #ifndef _E1000_ETHDEV_H_
> > >  #define _E1000_ETHDEV_H_
> > >
> > > +#include <rte_spinlock.h>
> > > +
> > >  /* need update link, bit flag */
> > >  #define E1000_FLAG_NEED_LINK_UPDATE (uint32_t)(1 << 0)
> > >  #define E1000_FLAG_MAILBOX          (uint32_t)(1 << 1)
> > > @@ -105,10 +107,14 @@
> > >  #define E1000_FTQF_QUEUE_SHIFT           16
> > >  #define E1000_FTQF_QUEUE_ENABLE          0x00000100
> > >
> > > +/* maximum number of other interrupts besides Rx & Tx interrupts */
> > > +#define E1000_MAX_OTHER_INTR		1
> > > +
> > >  /* structure for interrupt relative data */
> > >  struct e1000_interrupt {
> > >  	uint32_t flags;
> > >  	uint32_t mask;
> > > +	rte_spinlock_t lock;
> > >  };
> > >
> > >  /* local vfta copy */
> > > diff --git a/lib/librte_pmd_e1000/igb_ethdev.c
> > > b/lib/librte_pmd_e1000/igb_ethdev.c
> > > index 2a268b8..7d9b103 100644
> > > --- a/lib/librte_pmd_e1000/igb_ethdev.c
> > > +++ b/lib/librte_pmd_e1000/igb_ethdev.c
> > > @@ -97,6 +97,7 @@ static int  eth_igb_flow_ctrl_get(struct rte_eth_dev *dev,
> > >  static int  eth_igb_flow_ctrl_set(struct rte_eth_dev *dev,
> > >  				struct rte_eth_fc_conf *fc_conf);
> > >  static int eth_igb_lsc_interrupt_setup(struct rte_eth_dev *dev);
> > > +static int eth_igb_rxq_interrupt_setup(struct rte_eth_dev *dev);
> > >  static int eth_igb_interrupt_get_status(struct rte_eth_dev *dev);
> > >  static int eth_igb_interrupt_action(struct rte_eth_dev *dev);
> > >  static void eth_igb_interrupt_handler(struct rte_intr_handle *handle,
> > > @@ -191,6 +192,14 @@ static int eth_igb_filter_ctrl(struct rte_eth_dev *dev,
> > >  		     enum rte_filter_op filter_op,
> > >  		     void *arg);
> > >
> > > +static int eth_igb_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t
> > > queue_id);
> > > +static int eth_igb_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t
> > > queue_id);
> > > +static void eth_igb_assign_msix_vector(struct e1000_hw *hw, int8_t direction,
> > > +				uint8_t queue, uint8_t msix_vector);
> > > +static void eth_igb_configure_msix_intr(struct  e1000_hw *hw);
> > > +static void eth_igb_write_ivar(struct e1000_hw *hw, uint8_t msix_vector,
> > > +				uint8_t index, uint8_t offset);
> > > +
> > >  /*
> > >   * Define VF Stats MACRO for Non "cleared on read" register
> > >   */
> > > @@ -250,6 +259,8 @@ static struct eth_dev_ops eth_igb_ops = {
> > >  	.vlan_tpid_set        = eth_igb_vlan_tpid_set,
> > >  	.vlan_offload_set     = eth_igb_vlan_offload_set,
> > >  	.rx_queue_setup       = eth_igb_rx_queue_setup,
> > > +	.rx_queue_intr_enable = eth_igb_rx_queue_intr_enable,
> > > +	.rx_queue_intr_disable = eth_igb_rx_queue_intr_disable,
> > >  	.rx_queue_release     = eth_igb_rx_queue_release,
> > >  	.rx_queue_count       = eth_igb_rx_queue_count,
> > >  	.rx_descriptor_done   = eth_igb_rx_descriptor_done,
> > > @@ -592,6 +603,16 @@ eth_igb_dev_init(__attribute__((unused)) struct
> > > eth_driver *eth_drv,
> > >  		     eth_dev->data->port_id, pci_dev->id.vendor_id,
> > >  		     pci_dev->id.device_id);
> > >
> > > +	/* set max interrupt vfio request */
> > > +	struct rte_eth_dev_info dev_info;
> > > +
> > > +	memset(&dev_info, 0, sizeof(dev_info));
> > > +	eth_igb_infos_get(eth_dev, &dev_info);
> > > +
> > > +	hw->mac.max_rx_queues = dev_info.max_rx_queues;
> > > +
> > > +	pci_dev->intr_handle.max_intr = hw->mac.max_rx_queues +
> > > E1000_MAX_OTHER_INTR;
> > > +
> > >  	rte_intr_callback_register(&(pci_dev->intr_handle),
> > >  		eth_igb_interrupt_handler, (void *)eth_dev);
> > >
> > > @@ -754,7 +775,7 @@ eth_igb_start(struct rte_eth_dev *dev)
> > >  {
> > >  	struct e1000_hw *hw =
> > >  		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> > > -	int ret, i, mask;
> > > +	int ret, mask;
> > >  	uint32_t ctrl_ext;
> > >
> > >  	PMD_INIT_FUNC_TRACE();
> > > @@ -794,6 +815,9 @@ eth_igb_start(struct rte_eth_dev *dev)
> > >  	/* configure PF module if SRIOV enabled */
> > >  	igb_pf_host_configure(dev);
> > >
> > > +	/* confiugre msix for rx interrupt */
> > > +	eth_igb_configure_msix_intr(hw);
> > > +
> > >  	/* Configure for OS presence */
> > >  	igb_init_manageability(hw);
> > >
> > > @@ -821,33 +845,9 @@ eth_igb_start(struct rte_eth_dev *dev)
> > >  		igb_vmdq_vlan_hw_filter_enable(dev);
> > >  	}
> > >
> > > -	/*
> > > -	 * Configure the Interrupt Moderation register (EITR) with the maximum
> > > -	 * possible value (0xFFFF) to minimize "System Partial Write" issued by
> > > -	 * spurious [DMA] memory updates of RX and TX ring descriptors.
> > > -	 *
> > > -	 * With a EITR granularity of 2 microseconds in the 82576, only 7/8
> > > -	 * spurious memory updates per second should be expected.
> > > -	 * ((65535 * 2) / 1000.1000 ~= 0.131 second).
> > > -	 *
> > > -	 * Because interrupts are not used at all, the MSI-X is not activated
> > > -	 * and interrupt moderation is controlled by EITR[0].
> > > -	 *
> > > -	 * Note that having [almost] disabled memory updates of RX and TX ring
> > > -	 * descriptors through the Interrupt Moderation mechanism, memory
> > > -	 * updates of ring descriptors are now moderated by the configurable
> > > -	 * value of Write-Back Threshold registers.
> > > -	 */
> > >  	if ((hw->mac.type == e1000_82576) || (hw->mac.type == e1000_82580)
> > > ||
> > >  		(hw->mac.type == e1000_i350) || (hw->mac.type == e1000_i210)
> > > ||
> > >  		(hw->mac.type == e1000_i211)) {
> > > -		uint32_t ivar;
> > > -
> > > -		/* Enable all RX & TX queues in the IVAR registers */
> > > -		ivar = (uint32_t) ((E1000_IVAR_VALID << 16) |
> > > E1000_IVAR_VALID);
> > > -		for (i = 0; i < 8; i++)
> > > -			E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, i, ivar);
> > > -
> > >  		/* Configure EITR with the maximum possible value (0xFFFF) */
> > >  		E1000_WRITE_REG(hw, E1000_EITR(0), 0xFFFF);
> > >  	}
> > > @@ -901,6 +901,10 @@ eth_igb_start(struct rte_eth_dev *dev)
> > >  	if (dev->data->dev_conf.intr_conf.lsc != 0)
> > >  		ret = eth_igb_lsc_interrupt_setup(dev);
> > >
> > > +	/* check if rxq interrupt is enabled */
> > > +	if (dev->data->dev_conf.intr_conf.rxq != 0)
> > > +		eth_igb_rxq_interrupt_setup(dev);
> > > +
> > >  	/* resume enabled intr since hw reset */
> > >  	igb_intr_enable(dev);
> > >
> > > @@ -1791,6 +1795,35 @@ eth_igb_lsc_interrupt_setup(struct rte_eth_dev *dev)
> > >  		E1000_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
> > >
> > >  	intr->mask |= E1000_ICR_LSC;
> > > +	rte_spinlock_init(&(intr->lock));
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/*
> > > + * It clears the interrupt causes and enables the interrupt.
> > > + * It will be called once only during nic initialized.
> > > + *
> > > + * @param dev
> > > + *  Pointer to struct rte_eth_dev.
> > > + *
> > > + * @return
> > > + *  - On success, zero.
> > > + *  - On failure, a negative value.
> > > + */
> > > +static int eth_igb_rxq_interrupt_setup(struct rte_eth_dev *dev)
> > > +{
> > > +	uint32_t mask, regval;
> > > +	struct e1000_hw *hw =
> > > +		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> > > +	struct rte_eth_dev_info dev_info;
> > > +
> > > +	memset(&dev_info, 0, sizeof(dev_info));
> > > +	eth_igb_infos_get(dev, &dev_info);
> > > +
> > > +	mask = 0xFFFFFFFF >> (32 - dev_info.max_rx_queues);
> > > +	regval = E1000_READ_REG(hw, E1000_EIMS);
> > > +	E1000_WRITE_REG(hw, E1000_EIMS, regval | mask);
> > >
> > >  	return 0;
> > >  }
> > > @@ -3256,5 +3289,152 @@ static struct rte_driver pmd_igbvf_drv = {
> > >  	.init = rte_igbvf_pmd_init,
> > >  };
> > >
> > > +static int
> > > +eth_igb_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
> > > +{
> > > +	struct e1000_hw *hw =
> > > +		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> > > +	struct e1000_interrupt *intr =
> > > +		E1000_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
> > > +	uint32_t mask = 1 << queue_id;
> > > +
> > > +	rte_spinlock_lock(&(intr->lock));
> > > +	E1000_WRITE_REG(hw, E1000_EIMC, mask);
> > > +	E1000_WRITE_FLUSH(hw);
> > > +	rte_spinlock_unlock(&(intr->lock));
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static int
> > > +eth_igb_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
> > > +{
> > > +	struct e1000_hw *hw =
> > > +		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> > > +	struct e1000_interrupt *intr =
> > > +		E1000_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
> > > +	uint32_t mask = 1 << queue_id;
> > > +	uint32_t regval;
> > > +
> > > +	rte_spinlock_lock(&(intr->lock));
> > > +	regval = E1000_READ_REG(hw, E1000_EIMS);
> > > +	E1000_WRITE_REG(hw, E1000_EIMS, regval | mask);
> > > +	E1000_WRITE_FLUSH(hw);
> > > +	rte_spinlock_unlock(&(intr->lock));
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static void
> > > +eth_igb_write_ivar(struct e1000_hw *hw, uint8_t  msix_vector,
> > > +			uint8_t index, uint8_t offset)
> > > +{
> > > +	uint32_t val = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index);
> > > +
> > > +	/* clear bits */
> > > +	val &= ~((uint32_t)0xFF << offset);
> > > +
> > > +	/* write vector and valid bit */
> > > +	val |= (msix_vector | E1000_IVAR_VALID) << offset;
> > > +
> > > +	E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, val);
> > > +}
> > > +
> > > +static void
> > > +eth_igb_assign_msix_vector(struct e1000_hw *hw, int8_t direction,
> > > +				 uint8_t queue, uint8_t msix_vector)
> > > +{
> > > +	uint32_t tmp = 0;
> > > +	if (hw->mac.type == e1000_82575) {
> > > +		if (direction == 0)
> > > +			tmp = E1000_EICR_RX_QUEUE0 << queue;
> > > +		else if (direction == 1)
> > > +			tmp = E1000_EICR_TX_QUEUE0 << queue;
> > > +		E1000_WRITE_REG(hw, E1000_MSIXBM(msix_vector), tmp);
> > > +	} else if (hw->mac.type == e1000_82576) {
> > > +		if ((direction == 0) || (direction == 1))
> > > +			eth_igb_write_ivar(hw, msix_vector, queue & 0x7,
> > > +					((queue & 0x8) << 1) + 8 * direction);
> > > +	} else if ((hw->mac.type == e1000_82580) ||
> > > +			(hw->mac.type == e1000_i350) ||
> > > +			(hw->mac.type == e1000_i354) ||
> > > +			(hw->mac.type == e1000_i210) ||
> > > +			(hw->mac.type == e1000_i211)) {
> > > +		if ((direction == 0) || (direction == 1))
> > > +			eth_igb_write_ivar(hw, msix_vector,
> > > +					queue >> 1,
> > > +					((queue & 0x1) << 4) + 8 * direction);
> > > +	}
> > > +}
> > > +
> > > +/*
> > > + * Sets up the hardware to generate MSI-X interrupts properly
> > > + * @hw
> > > + *  board private structure
> > > + */
> > > +static void
> > > +eth_igb_configure_msix_intr(struct e1000_hw *hw)
> > > +{
> > > +	int queue_id;
> > > +	uint32_t tmpval, regval, intr_mask;
> > > +	uint32_t max_rx_queues = hw->mac.max_rx_queues;
> > > +
> > > +	/* set interrupt vector for other causes */
> > > +	if (hw->mac.type == e1000_82575) {
> > > +		tmpval = E1000_READ_REG(hw, E1000_CTRL_EXT);
> > > +		/* enable MSI-X PBA support */
> > > +		tmpval |= E1000_CTRL_EXT_PBA_CLR;
> > > +
> > > +		/* Auto-Mask interrupts upon ICR read */
> > > +		tmpval |= E1000_CTRL_EXT_EIAME;
> > > +		tmpval |= E1000_CTRL_EXT_IRCA;
> > > +
> > > +		E1000_WRITE_REG(hw, E1000_CTRL_EXT, tmpval);
> > > +
> > > +		/* enable msix_other interrupt */
> > > +		E1000_WRITE_REG_ARRAY(hw, E1000_MSIXBM(0), 0,
> > > E1000_EIMS_OTHER);
> > > +		regval = E1000_READ_REG(hw, E1000_EIAC);
> > > +		E1000_WRITE_REG(hw, E1000_EIAC, regval |
> > > E1000_EIMS_OTHER);
> > > +		regval = E1000_READ_REG(hw, E1000_EIAM);
> > > +		E1000_WRITE_REG(hw, E1000_EIMS, regval |
> > > E1000_EIMS_OTHER);
> > > +	} else if ((hw->mac.type == e1000_82576) ||
> > > +			(hw->mac.type == e1000_82580) ||
> > > +			(hw->mac.type == e1000_i350) ||
> > > +			(hw->mac.type == e1000_i354) ||
> > > +			(hw->mac.type == e1000_i210) ||
> > > +			(hw->mac.type == e1000_i211)) {
> > > +		/* turn on MSI-X capability first */
> > > +		E1000_WRITE_REG(hw, E1000_GPIE, E1000_GPIE_MSIX_MODE |
> > > +							E1000_GPIE_PBA |
> > > E1000_GPIE_EIAME |
> > > +							E1000_GPIE_NSICR);
> > > +
> > > +		/* enable msix_other interrupt */
> > > +		intr_mask = 1 << max_rx_queues;
> > > +		regval = E1000_READ_REG(hw, E1000_EIAC);
> > > +		E1000_WRITE_REG(hw, E1000_EIAC, regval | intr_mask);
> > > +		regval = E1000_READ_REG(hw, E1000_EIMS);
> > > +		E1000_WRITE_REG(hw, E1000_EIMS, regval | intr_mask);
> > > +		tmpval = (max_rx_queues | E1000_IVAR_VALID) << 8;
> > > +
> > > +		E1000_WRITE_REG(hw, E1000_IVAR_MISC, tmpval);
> > > +	}
> > > +
> > > +	/*
> > > +	* use EIAM and EIAC to auto-mask and auto-clear when MSI-X interrupt
> > > is asserted
> > > +	* this saves a register write for every interrupt
> > > +	*/
> > > +	intr_mask = 0xFFFFFFFF >> (32 - max_rx_queues);
> > > +	regval = E1000_READ_REG(hw, E1000_EIAC);
> > > +	E1000_WRITE_REG(hw, E1000_EIAC, regval | intr_mask);
> > > +	regval = E1000_READ_REG(hw, E1000_EIAM);
> > > +	E1000_WRITE_REG(hw, E1000_EIAM, regval | intr_mask);
> > > +
> > > +	for (queue_id = 0; queue_id < VFIO_MAX_QUEUE_ID; queue_id++)
> > > +		eth_igb_assign_msix_vector(hw, 0, queue_id, queue_id);
> > > +
> > > +	E1000_WRITE_FLUSH(hw);
> > > +}
> > > +
> > > +
> > >  PMD_REGISTER_DRIVER(pmd_igb_drv);
> > >  PMD_REGISTER_DRIVER(pmd_igbvf_drv);
> > > --
> > > 1.8.1.4
  
Zhou, Danny Feb. 13, 2015, 7:20 a.m. UTC | #4
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Zhou, Danny
> Sent: Friday, February 13, 2015 10:48 AM
> To: Ananyev, Konstantin; Liang, Cunming; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2 3/5] igb: enable rx queue interrupts for PF
> 
> 
> 
> > -----Original Message-----
> > From: Ananyev, Konstantin
> > Sent: Thursday, February 12, 2015 6:29 PM
> > To: Liang, Cunming; Zhou, Danny; dev@dpdk.org
> > Subject: RE: [dpdk-dev] [PATCH v2 3/5] igb: enable rx queue interrupts for PF
> >
> >
> >
> > > -----Original Message-----
> > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Liang, Cunming
> > > Sent: Thursday, February 12, 2015 10:01 AM
> > > To: Zhou, Danny; dev@dpdk.org
> > > Subject: Re: [dpdk-dev] [PATCH v2 3/5] igb: enable rx queue interrupts for PF
> > >
> > >
> > >
> > > > -----Original Message-----
> > > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Zhou Danny
> > > > Sent: Tuesday, February 03, 2015 4:18 PM
> > > > To: dev@dpdk.org
> > > > Subject: [dpdk-dev] [PATCH v2 3/5] igb: enable rx queue interrupts for PF
> > > >
> > > > v2 changes
> > > > - Consolidate review comments related to coding style
> > > >
> > > > The patch does below for igb PF:
> > > > - Setup NIC to generate MSI-X interrupts
> > > > - Set the IVAR register to map interrupt causes to vectors
> > > > - Implement interrupt enable/disable functions
> > > >
> > > > Signed-off-by: Danny Zhou <danny.zhou@intel.com>
> > > > Tested-by: Yong Liu <yong.liu@intel.com>
> > > > ---
> > > >  lib/librte_pmd_e1000/e1000/e1000_hw.h |   3 +
> > > >  lib/librte_pmd_e1000/e1000_ethdev.h   |   6 +
> > > >  lib/librte_pmd_e1000/igb_ethdev.c     | 230
> > > > ++++++++++++++++++++++++++++++----
> > > >  3 files changed, 214 insertions(+), 25 deletions(-)
> > > >
> > > > diff --git a/lib/librte_pmd_e1000/e1000/e1000_hw.h
> > > > b/lib/librte_pmd_e1000/e1000/e1000_hw.h
> > > > index 4dd92a3..9b999ec 100644
> > > > --- a/lib/librte_pmd_e1000/e1000/e1000_hw.h
> > > > +++ b/lib/librte_pmd_e1000/e1000/e1000_hw.h
> > > > @@ -780,6 +780,9 @@ struct e1000_mac_info {
> > > >  	u16 mta_reg_count;
> > > >  	u16 uta_reg_count;
> > > >
> > > > +	u32 max_rx_queues;
> > > > +	u32 max_tx_queues;
> > > > +
> > > [LCM]  It can be avoid to define new things in share code.
> > > The max_rx/tx_queues in mac info only used by eth_igb_configure_msix_intr().
> > > It the input param won't be limit to 'hw'. If using rte_eth_dev as input, you can get all the info from dev_info.
> > > The risk in share code is, on next time code merging, it won't care the change here do.
> >
> 
> Will move those two parameters to dev_info struct.
> 
> > +1
> 
> >
> > BTW, another question, do we really need a spin_lock here (and in ixgbe as I remember)?
> > Usually we trying to avoid locks inside PMD code, just marking API call as 'not thread safe'.
> > Konstantin
> >
> 
> Will replace spin_lock with rte_atomic32_cmpset(), and return failure if interrupt enable/disable
> does not take effect. Then it is decision of caller(e.g. l3fwd_power) in application rather
> than PMD to spin or not depending on if it really needs to enable/disable interrupt before doing
> following stuff.
> 

Looks this is not best way to make these two functions "thread safe", will remove any check and
Mark them as "thread unsafe". In addition, move spin_lock stuff to L3fwd_power. Tests shows we
have to introduce locks to ensure correct per-queue interrupt and polling mode switch control, especially
when lots of queues are enabled.

> > >
> > > >  	/* Maximum size of the MTA register table in all supported adapters */
> > > >  	#define MAX_MTA_REG 128
> > > >  	u32 mta_shadow[MAX_MTA_REG];
> > > > diff --git a/lib/librte_pmd_e1000/e1000_ethdev.h
> > > > b/lib/librte_pmd_e1000/e1000_ethdev.h
> > > > index d155e77..713ca11 100644
> > > > --- a/lib/librte_pmd_e1000/e1000_ethdev.h
> > > > +++ b/lib/librte_pmd_e1000/e1000_ethdev.h
> > > > @@ -34,6 +34,8 @@
> > > >  #ifndef _E1000_ETHDEV_H_
> > > >  #define _E1000_ETHDEV_H_
> > > >
> > > > +#include <rte_spinlock.h>
> > > > +
> > > >  /* need update link, bit flag */
> > > >  #define E1000_FLAG_NEED_LINK_UPDATE (uint32_t)(1 << 0)
> > > >  #define E1000_FLAG_MAILBOX          (uint32_t)(1 << 1)
> > > > @@ -105,10 +107,14 @@
> > > >  #define E1000_FTQF_QUEUE_SHIFT           16
> > > >  #define E1000_FTQF_QUEUE_ENABLE          0x00000100
> > > >
> > > > +/* maximum number of other interrupts besides Rx & Tx interrupts */
> > > > +#define E1000_MAX_OTHER_INTR		1
> > > > +
> > > >  /* structure for interrupt relative data */
> > > >  struct e1000_interrupt {
> > > >  	uint32_t flags;
> > > >  	uint32_t mask;
> > > > +	rte_spinlock_t lock;
> > > >  };
> > > >
> > > >  /* local vfta copy */
> > > > diff --git a/lib/librte_pmd_e1000/igb_ethdev.c
> > > > b/lib/librte_pmd_e1000/igb_ethdev.c
> > > > index 2a268b8..7d9b103 100644
> > > > --- a/lib/librte_pmd_e1000/igb_ethdev.c
> > > > +++ b/lib/librte_pmd_e1000/igb_ethdev.c
> > > > @@ -97,6 +97,7 @@ static int  eth_igb_flow_ctrl_get(struct rte_eth_dev *dev,
> > > >  static int  eth_igb_flow_ctrl_set(struct rte_eth_dev *dev,
> > > >  				struct rte_eth_fc_conf *fc_conf);
> > > >  static int eth_igb_lsc_interrupt_setup(struct rte_eth_dev *dev);
> > > > +static int eth_igb_rxq_interrupt_setup(struct rte_eth_dev *dev);
> > > >  static int eth_igb_interrupt_get_status(struct rte_eth_dev *dev);
> > > >  static int eth_igb_interrupt_action(struct rte_eth_dev *dev);
> > > >  static void eth_igb_interrupt_handler(struct rte_intr_handle *handle,
> > > > @@ -191,6 +192,14 @@ static int eth_igb_filter_ctrl(struct rte_eth_dev *dev,
> > > >  		     enum rte_filter_op filter_op,
> > > >  		     void *arg);
> > > >
> > > > +static int eth_igb_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t
> > > > queue_id);
> > > > +static int eth_igb_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t
> > > > queue_id);
> > > > +static void eth_igb_assign_msix_vector(struct e1000_hw *hw, int8_t direction,
> > > > +				uint8_t queue, uint8_t msix_vector);
> > > > +static void eth_igb_configure_msix_intr(struct  e1000_hw *hw);
> > > > +static void eth_igb_write_ivar(struct e1000_hw *hw, uint8_t msix_vector,
> > > > +				uint8_t index, uint8_t offset);
> > > > +
> > > >  /*
> > > >   * Define VF Stats MACRO for Non "cleared on read" register
> > > >   */
> > > > @@ -250,6 +259,8 @@ static struct eth_dev_ops eth_igb_ops = {
> > > >  	.vlan_tpid_set        = eth_igb_vlan_tpid_set,
> > > >  	.vlan_offload_set     = eth_igb_vlan_offload_set,
> > > >  	.rx_queue_setup       = eth_igb_rx_queue_setup,
> > > > +	.rx_queue_intr_enable = eth_igb_rx_queue_intr_enable,
> > > > +	.rx_queue_intr_disable = eth_igb_rx_queue_intr_disable,
> > > >  	.rx_queue_release     = eth_igb_rx_queue_release,
> > > >  	.rx_queue_count       = eth_igb_rx_queue_count,
> > > >  	.rx_descriptor_done   = eth_igb_rx_descriptor_done,
> > > > @@ -592,6 +603,16 @@ eth_igb_dev_init(__attribute__((unused)) struct
> > > > eth_driver *eth_drv,
> > > >  		     eth_dev->data->port_id, pci_dev->id.vendor_id,
> > > >  		     pci_dev->id.device_id);
> > > >
> > > > +	/* set max interrupt vfio request */
> > > > +	struct rte_eth_dev_info dev_info;
> > > > +
> > > > +	memset(&dev_info, 0, sizeof(dev_info));
> > > > +	eth_igb_infos_get(eth_dev, &dev_info);
> > > > +
> > > > +	hw->mac.max_rx_queues = dev_info.max_rx_queues;
> > > > +
> > > > +	pci_dev->intr_handle.max_intr = hw->mac.max_rx_queues +
> > > > E1000_MAX_OTHER_INTR;
> > > > +
> > > >  	rte_intr_callback_register(&(pci_dev->intr_handle),
> > > >  		eth_igb_interrupt_handler, (void *)eth_dev);
> > > >
> > > > @@ -754,7 +775,7 @@ eth_igb_start(struct rte_eth_dev *dev)
> > > >  {
> > > >  	struct e1000_hw *hw =
> > > >  		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> > > > -	int ret, i, mask;
> > > > +	int ret, mask;
> > > >  	uint32_t ctrl_ext;
> > > >
> > > >  	PMD_INIT_FUNC_TRACE();
> > > > @@ -794,6 +815,9 @@ eth_igb_start(struct rte_eth_dev *dev)
> > > >  	/* configure PF module if SRIOV enabled */
> > > >  	igb_pf_host_configure(dev);
> > > >
> > > > +	/* confiugre msix for rx interrupt */
> > > > +	eth_igb_configure_msix_intr(hw);
> > > > +
> > > >  	/* Configure for OS presence */
> > > >  	igb_init_manageability(hw);
> > > >
> > > > @@ -821,33 +845,9 @@ eth_igb_start(struct rte_eth_dev *dev)
> > > >  		igb_vmdq_vlan_hw_filter_enable(dev);
> > > >  	}
> > > >
> > > > -	/*
> > > > -	 * Configure the Interrupt Moderation register (EITR) with the maximum
> > > > -	 * possible value (0xFFFF) to minimize "System Partial Write" issued by
> > > > -	 * spurious [DMA] memory updates of RX and TX ring descriptors.
> > > > -	 *
> > > > -	 * With a EITR granularity of 2 microseconds in the 82576, only 7/8
> > > > -	 * spurious memory updates per second should be expected.
> > > > -	 * ((65535 * 2) / 1000.1000 ~= 0.131 second).
> > > > -	 *
> > > > -	 * Because interrupts are not used at all, the MSI-X is not activated
> > > > -	 * and interrupt moderation is controlled by EITR[0].
> > > > -	 *
> > > > -	 * Note that having [almost] disabled memory updates of RX and TX ring
> > > > -	 * descriptors through the Interrupt Moderation mechanism, memory
> > > > -	 * updates of ring descriptors are now moderated by the configurable
> > > > -	 * value of Write-Back Threshold registers.
> > > > -	 */
> > > >  	if ((hw->mac.type == e1000_82576) || (hw->mac.type == e1000_82580)
> > > > ||
> > > >  		(hw->mac.type == e1000_i350) || (hw->mac.type == e1000_i210)
> > > > ||
> > > >  		(hw->mac.type == e1000_i211)) {
> > > > -		uint32_t ivar;
> > > > -
> > > > -		/* Enable all RX & TX queues in the IVAR registers */
> > > > -		ivar = (uint32_t) ((E1000_IVAR_VALID << 16) |
> > > > E1000_IVAR_VALID);
> > > > -		for (i = 0; i < 8; i++)
> > > > -			E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, i, ivar);
> > > > -
> > > >  		/* Configure EITR with the maximum possible value (0xFFFF) */
> > > >  		E1000_WRITE_REG(hw, E1000_EITR(0), 0xFFFF);
> > > >  	}
> > > > @@ -901,6 +901,10 @@ eth_igb_start(struct rte_eth_dev *dev)
> > > >  	if (dev->data->dev_conf.intr_conf.lsc != 0)
> > > >  		ret = eth_igb_lsc_interrupt_setup(dev);
> > > >
> > > > +	/* check if rxq interrupt is enabled */
> > > > +	if (dev->data->dev_conf.intr_conf.rxq != 0)
> > > > +		eth_igb_rxq_interrupt_setup(dev);
> > > > +
> > > >  	/* resume enabled intr since hw reset */
> > > >  	igb_intr_enable(dev);
> > > >
> > > > @@ -1791,6 +1795,35 @@ eth_igb_lsc_interrupt_setup(struct rte_eth_dev *dev)
> > > >  		E1000_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
> > > >
> > > >  	intr->mask |= E1000_ICR_LSC;
> > > > +	rte_spinlock_init(&(intr->lock));
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +/*
> > > > + * It clears the interrupt causes and enables the interrupt.
> > > > + * It will be called once only during nic initialized.
> > > > + *
> > > > + * @param dev
> > > > + *  Pointer to struct rte_eth_dev.
> > > > + *
> > > > + * @return
> > > > + *  - On success, zero.
> > > > + *  - On failure, a negative value.
> > > > + */
> > > > +static int eth_igb_rxq_interrupt_setup(struct rte_eth_dev *dev)
> > > > +{
> > > > +	uint32_t mask, regval;
> > > > +	struct e1000_hw *hw =
> > > > +		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> > > > +	struct rte_eth_dev_info dev_info;
> > > > +
> > > > +	memset(&dev_info, 0, sizeof(dev_info));
> > > > +	eth_igb_infos_get(dev, &dev_info);
> > > > +
> > > > +	mask = 0xFFFFFFFF >> (32 - dev_info.max_rx_queues);
> > > > +	regval = E1000_READ_REG(hw, E1000_EIMS);
> > > > +	E1000_WRITE_REG(hw, E1000_EIMS, regval | mask);
> > > >
> > > >  	return 0;
> > > >  }
> > > > @@ -3256,5 +3289,152 @@ static struct rte_driver pmd_igbvf_drv = {
> > > >  	.init = rte_igbvf_pmd_init,
> > > >  };
> > > >
> > > > +static int
> > > > +eth_igb_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
> > > > +{
> > > > +	struct e1000_hw *hw =
> > > > +		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> > > > +	struct e1000_interrupt *intr =
> > > > +		E1000_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
> > > > +	uint32_t mask = 1 << queue_id;
> > > > +
> > > > +	rte_spinlock_lock(&(intr->lock));
> > > > +	E1000_WRITE_REG(hw, E1000_EIMC, mask);
> > > > +	E1000_WRITE_FLUSH(hw);
> > > > +	rte_spinlock_unlock(&(intr->lock));
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +static int
> > > > +eth_igb_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
> > > > +{
> > > > +	struct e1000_hw *hw =
> > > > +		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> > > > +	struct e1000_interrupt *intr =
> > > > +		E1000_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
> > > > +	uint32_t mask = 1 << queue_id;
> > > > +	uint32_t regval;
> > > > +
> > > > +	rte_spinlock_lock(&(intr->lock));
> > > > +	regval = E1000_READ_REG(hw, E1000_EIMS);
> > > > +	E1000_WRITE_REG(hw, E1000_EIMS, regval | mask);
> > > > +	E1000_WRITE_FLUSH(hw);
> > > > +	rte_spinlock_unlock(&(intr->lock));
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +static void
> > > > +eth_igb_write_ivar(struct e1000_hw *hw, uint8_t  msix_vector,
> > > > +			uint8_t index, uint8_t offset)
> > > > +{
> > > > +	uint32_t val = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index);
> > > > +
> > > > +	/* clear bits */
> > > > +	val &= ~((uint32_t)0xFF << offset);
> > > > +
> > > > +	/* write vector and valid bit */
> > > > +	val |= (msix_vector | E1000_IVAR_VALID) << offset;
> > > > +
> > > > +	E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, val);
> > > > +}
> > > > +
> > > > +static void
> > > > +eth_igb_assign_msix_vector(struct e1000_hw *hw, int8_t direction,
> > > > +				 uint8_t queue, uint8_t msix_vector)
> > > > +{
> > > > +	uint32_t tmp = 0;
> > > > +	if (hw->mac.type == e1000_82575) {
> > > > +		if (direction == 0)
> > > > +			tmp = E1000_EICR_RX_QUEUE0 << queue;
> > > > +		else if (direction == 1)
> > > > +			tmp = E1000_EICR_TX_QUEUE0 << queue;
> > > > +		E1000_WRITE_REG(hw, E1000_MSIXBM(msix_vector), tmp);
> > > > +	} else if (hw->mac.type == e1000_82576) {
> > > > +		if ((direction == 0) || (direction == 1))
> > > > +			eth_igb_write_ivar(hw, msix_vector, queue & 0x7,
> > > > +					((queue & 0x8) << 1) + 8 * direction);
> > > > +	} else if ((hw->mac.type == e1000_82580) ||
> > > > +			(hw->mac.type == e1000_i350) ||
> > > > +			(hw->mac.type == e1000_i354) ||
> > > > +			(hw->mac.type == e1000_i210) ||
> > > > +			(hw->mac.type == e1000_i211)) {
> > > > +		if ((direction == 0) || (direction == 1))
> > > > +			eth_igb_write_ivar(hw, msix_vector,
> > > > +					queue >> 1,
> > > > +					((queue & 0x1) << 4) + 8 * direction);
> > > > +	}
> > > > +}
> > > > +
> > > > +/*
> > > > + * Sets up the hardware to generate MSI-X interrupts properly
> > > > + * @hw
> > > > + *  board private structure
> > > > + */
> > > > +static void
> > > > +eth_igb_configure_msix_intr(struct e1000_hw *hw)
> > > > +{
> > > > +	int queue_id;
> > > > +	uint32_t tmpval, regval, intr_mask;
> > > > +	uint32_t max_rx_queues = hw->mac.max_rx_queues;
> > > > +
> > > > +	/* set interrupt vector for other causes */
> > > > +	if (hw->mac.type == e1000_82575) {
> > > > +		tmpval = E1000_READ_REG(hw, E1000_CTRL_EXT);
> > > > +		/* enable MSI-X PBA support */
> > > > +		tmpval |= E1000_CTRL_EXT_PBA_CLR;
> > > > +
> > > > +		/* Auto-Mask interrupts upon ICR read */
> > > > +		tmpval |= E1000_CTRL_EXT_EIAME;
> > > > +		tmpval |= E1000_CTRL_EXT_IRCA;
> > > > +
> > > > +		E1000_WRITE_REG(hw, E1000_CTRL_EXT, tmpval);
> > > > +
> > > > +		/* enable msix_other interrupt */
> > > > +		E1000_WRITE_REG_ARRAY(hw, E1000_MSIXBM(0), 0,
> > > > E1000_EIMS_OTHER);
> > > > +		regval = E1000_READ_REG(hw, E1000_EIAC);
> > > > +		E1000_WRITE_REG(hw, E1000_EIAC, regval |
> > > > E1000_EIMS_OTHER);
> > > > +		regval = E1000_READ_REG(hw, E1000_EIAM);
> > > > +		E1000_WRITE_REG(hw, E1000_EIMS, regval |
> > > > E1000_EIMS_OTHER);
> > > > +	} else if ((hw->mac.type == e1000_82576) ||
> > > > +			(hw->mac.type == e1000_82580) ||
> > > > +			(hw->mac.type == e1000_i350) ||
> > > > +			(hw->mac.type == e1000_i354) ||
> > > > +			(hw->mac.type == e1000_i210) ||
> > > > +			(hw->mac.type == e1000_i211)) {
> > > > +		/* turn on MSI-X capability first */
> > > > +		E1000_WRITE_REG(hw, E1000_GPIE, E1000_GPIE_MSIX_MODE |
> > > > +							E1000_GPIE_PBA |
> > > > E1000_GPIE_EIAME |
> > > > +							E1000_GPIE_NSICR);
> > > > +
> > > > +		/* enable msix_other interrupt */
> > > > +		intr_mask = 1 << max_rx_queues;
> > > > +		regval = E1000_READ_REG(hw, E1000_EIAC);
> > > > +		E1000_WRITE_REG(hw, E1000_EIAC, regval | intr_mask);
> > > > +		regval = E1000_READ_REG(hw, E1000_EIMS);
> > > > +		E1000_WRITE_REG(hw, E1000_EIMS, regval | intr_mask);
> > > > +		tmpval = (max_rx_queues | E1000_IVAR_VALID) << 8;
> > > > +
> > > > +		E1000_WRITE_REG(hw, E1000_IVAR_MISC, tmpval);
> > > > +	}
> > > > +
> > > > +	/*
> > > > +	* use EIAM and EIAC to auto-mask and auto-clear when MSI-X interrupt
> > > > is asserted
> > > > +	* this saves a register write for every interrupt
> > > > +	*/
> > > > +	intr_mask = 0xFFFFFFFF >> (32 - max_rx_queues);
> > > > +	regval = E1000_READ_REG(hw, E1000_EIAC);
> > > > +	E1000_WRITE_REG(hw, E1000_EIAC, regval | intr_mask);
> > > > +	regval = E1000_READ_REG(hw, E1000_EIAM);
> > > > +	E1000_WRITE_REG(hw, E1000_EIAM, regval | intr_mask);
> > > > +
> > > > +	for (queue_id = 0; queue_id < VFIO_MAX_QUEUE_ID; queue_id++)
> > > > +		eth_igb_assign_msix_vector(hw, 0, queue_id, queue_id);
> > > > +
> > > > +	E1000_WRITE_FLUSH(hw);
> > > > +}
> > > > +
> > > > +
> > > >  PMD_REGISTER_DRIVER(pmd_igb_drv);
> > > >  PMD_REGISTER_DRIVER(pmd_igbvf_drv);
> > > > --
> > > > 1.8.1.4
  
Ananyev, Konstantin Feb. 13, 2015, 9:53 a.m. UTC | #5
> -----Original Message-----
> From: Zhou, Danny
> Sent: Friday, February 13, 2015 7:21 AM
> To: Ananyev, Konstantin; Liang, Cunming; dev@dpdk.org
> Subject: RE: [dpdk-dev] [PATCH v2 3/5] igb: enable rx queue interrupts for PF
> 
> 
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Zhou, Danny
> > Sent: Friday, February 13, 2015 10:48 AM
> > To: Ananyev, Konstantin; Liang, Cunming; dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH v2 3/5] igb: enable rx queue interrupts for PF
> >
> >
> >
> > > -----Original Message-----
> > > From: Ananyev, Konstantin
> > > Sent: Thursday, February 12, 2015 6:29 PM
> > > To: Liang, Cunming; Zhou, Danny; dev@dpdk.org
> > > Subject: RE: [dpdk-dev] [PATCH v2 3/5] igb: enable rx queue interrupts for PF
> > >
> > >
> > >
> > > > -----Original Message-----
> > > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Liang, Cunming
> > > > Sent: Thursday, February 12, 2015 10:01 AM
> > > > To: Zhou, Danny; dev@dpdk.org
> > > > Subject: Re: [dpdk-dev] [PATCH v2 3/5] igb: enable rx queue interrupts for PF
> > > >
> > > >
> > > >
> > > > > -----Original Message-----
> > > > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Zhou Danny
> > > > > Sent: Tuesday, February 03, 2015 4:18 PM
> > > > > To: dev@dpdk.org
> > > > > Subject: [dpdk-dev] [PATCH v2 3/5] igb: enable rx queue interrupts for PF
> > > > >
> > > > > v2 changes
> > > > > - Consolidate review comments related to coding style
> > > > >
> > > > > The patch does below for igb PF:
> > > > > - Setup NIC to generate MSI-X interrupts
> > > > > - Set the IVAR register to map interrupt causes to vectors
> > > > > - Implement interrupt enable/disable functions
> > > > >
> > > > > Signed-off-by: Danny Zhou <danny.zhou@intel.com>
> > > > > Tested-by: Yong Liu <yong.liu@intel.com>
> > > > > ---
> > > > >  lib/librte_pmd_e1000/e1000/e1000_hw.h |   3 +
> > > > >  lib/librte_pmd_e1000/e1000_ethdev.h   |   6 +
> > > > >  lib/librte_pmd_e1000/igb_ethdev.c     | 230
> > > > > ++++++++++++++++++++++++++++++----
> > > > >  3 files changed, 214 insertions(+), 25 deletions(-)
> > > > >
> > > > > diff --git a/lib/librte_pmd_e1000/e1000/e1000_hw.h
> > > > > b/lib/librte_pmd_e1000/e1000/e1000_hw.h
> > > > > index 4dd92a3..9b999ec 100644
> > > > > --- a/lib/librte_pmd_e1000/e1000/e1000_hw.h
> > > > > +++ b/lib/librte_pmd_e1000/e1000/e1000_hw.h
> > > > > @@ -780,6 +780,9 @@ struct e1000_mac_info {
> > > > >  	u16 mta_reg_count;
> > > > >  	u16 uta_reg_count;
> > > > >
> > > > > +	u32 max_rx_queues;
> > > > > +	u32 max_tx_queues;
> > > > > +
> > > > [LCM]  It can be avoid to define new things in share code.
> > > > The max_rx/tx_queues in mac info only used by eth_igb_configure_msix_intr().
> > > > It the input param won't be limit to 'hw'. If using rte_eth_dev as input, you can get all the info from dev_info.
> > > > The risk in share code is, on next time code merging, it won't care the change here do.
> > >
> >
> > Will move those two parameters to dev_info struct.
> >
> > > +1
> >
> > >
> > > BTW, another question, do we really need a spin_lock here (and in ixgbe as I remember)?
> > > Usually we trying to avoid locks inside PMD code, just marking API call as 'not thread safe'.
> > > Konstantin
> > >
> >
> > Will replace spin_lock with rte_atomic32_cmpset(), and return failure if interrupt enable/disable
> > does not take effect. Then it is decision of caller(e.g. l3fwd_power) in application rather
> > than PMD to spin or not depending on if it really needs to enable/disable interrupt before doing
> > following stuff.
> >
> 
> Looks this is not best way to make these two functions "thread safe", will remove any check and
> Mark them as "thread unsafe". In addition, move spin_lock stuff to L3fwd_power. Tests shows we
> have to introduce locks to ensure correct per-queue interrupt and polling mode switch control, especially
> when lots of queues are enabled.

Sounds good to me.
Thanks
Konstantin

> 
> > > >
> > > > >  	/* Maximum size of the MTA register table in all supported adapters */
> > > > >  	#define MAX_MTA_REG 128
> > > > >  	u32 mta_shadow[MAX_MTA_REG];
> > > > > diff --git a/lib/librte_pmd_e1000/e1000_ethdev.h
> > > > > b/lib/librte_pmd_e1000/e1000_ethdev.h
> > > > > index d155e77..713ca11 100644
> > > > > --- a/lib/librte_pmd_e1000/e1000_ethdev.h
> > > > > +++ b/lib/librte_pmd_e1000/e1000_ethdev.h
> > > > > @@ -34,6 +34,8 @@
> > > > >  #ifndef _E1000_ETHDEV_H_
> > > > >  #define _E1000_ETHDEV_H_
> > > > >
> > > > > +#include <rte_spinlock.h>
> > > > > +
> > > > >  /* need update link, bit flag */
> > > > >  #define E1000_FLAG_NEED_LINK_UPDATE (uint32_t)(1 << 0)
> > > > >  #define E1000_FLAG_MAILBOX          (uint32_t)(1 << 1)
> > > > > @@ -105,10 +107,14 @@
> > > > >  #define E1000_FTQF_QUEUE_SHIFT           16
> > > > >  #define E1000_FTQF_QUEUE_ENABLE          0x00000100
> > > > >
> > > > > +/* maximum number of other interrupts besides Rx & Tx interrupts */
> > > > > +#define E1000_MAX_OTHER_INTR		1
> > > > > +
> > > > >  /* structure for interrupt relative data */
> > > > >  struct e1000_interrupt {
> > > > >  	uint32_t flags;
> > > > >  	uint32_t mask;
> > > > > +	rte_spinlock_t lock;
> > > > >  };
> > > > >
> > > > >  /* local vfta copy */
> > > > > diff --git a/lib/librte_pmd_e1000/igb_ethdev.c
> > > > > b/lib/librte_pmd_e1000/igb_ethdev.c
> > > > > index 2a268b8..7d9b103 100644
> > > > > --- a/lib/librte_pmd_e1000/igb_ethdev.c
> > > > > +++ b/lib/librte_pmd_e1000/igb_ethdev.c
> > > > > @@ -97,6 +97,7 @@ static int  eth_igb_flow_ctrl_get(struct rte_eth_dev *dev,
> > > > >  static int  eth_igb_flow_ctrl_set(struct rte_eth_dev *dev,
> > > > >  				struct rte_eth_fc_conf *fc_conf);
> > > > >  static int eth_igb_lsc_interrupt_setup(struct rte_eth_dev *dev);
> > > > > +static int eth_igb_rxq_interrupt_setup(struct rte_eth_dev *dev);
> > > > >  static int eth_igb_interrupt_get_status(struct rte_eth_dev *dev);
> > > > >  static int eth_igb_interrupt_action(struct rte_eth_dev *dev);
> > > > >  static void eth_igb_interrupt_handler(struct rte_intr_handle *handle,
> > > > > @@ -191,6 +192,14 @@ static int eth_igb_filter_ctrl(struct rte_eth_dev *dev,
> > > > >  		     enum rte_filter_op filter_op,
> > > > >  		     void *arg);
> > > > >
> > > > > +static int eth_igb_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t
> > > > > queue_id);
> > > > > +static int eth_igb_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t
> > > > > queue_id);
> > > > > +static void eth_igb_assign_msix_vector(struct e1000_hw *hw, int8_t direction,
> > > > > +				uint8_t queue, uint8_t msix_vector);
> > > > > +static void eth_igb_configure_msix_intr(struct  e1000_hw *hw);
> > > > > +static void eth_igb_write_ivar(struct e1000_hw *hw, uint8_t msix_vector,
> > > > > +				uint8_t index, uint8_t offset);
> > > > > +
> > > > >  /*
> > > > >   * Define VF Stats MACRO for Non "cleared on read" register
> > > > >   */
> > > > > @@ -250,6 +259,8 @@ static struct eth_dev_ops eth_igb_ops = {
> > > > >  	.vlan_tpid_set        = eth_igb_vlan_tpid_set,
> > > > >  	.vlan_offload_set     = eth_igb_vlan_offload_set,
> > > > >  	.rx_queue_setup       = eth_igb_rx_queue_setup,
> > > > > +	.rx_queue_intr_enable = eth_igb_rx_queue_intr_enable,
> > > > > +	.rx_queue_intr_disable = eth_igb_rx_queue_intr_disable,
> > > > >  	.rx_queue_release     = eth_igb_rx_queue_release,
> > > > >  	.rx_queue_count       = eth_igb_rx_queue_count,
> > > > >  	.rx_descriptor_done   = eth_igb_rx_descriptor_done,
> > > > > @@ -592,6 +603,16 @@ eth_igb_dev_init(__attribute__((unused)) struct
> > > > > eth_driver *eth_drv,
> > > > >  		     eth_dev->data->port_id, pci_dev->id.vendor_id,
> > > > >  		     pci_dev->id.device_id);
> > > > >
> > > > > +	/* set max interrupt vfio request */
> > > > > +	struct rte_eth_dev_info dev_info;
> > > > > +
> > > > > +	memset(&dev_info, 0, sizeof(dev_info));
> > > > > +	eth_igb_infos_get(eth_dev, &dev_info);
> > > > > +
> > > > > +	hw->mac.max_rx_queues = dev_info.max_rx_queues;
> > > > > +
> > > > > +	pci_dev->intr_handle.max_intr = hw->mac.max_rx_queues +
> > > > > E1000_MAX_OTHER_INTR;
> > > > > +
> > > > >  	rte_intr_callback_register(&(pci_dev->intr_handle),
> > > > >  		eth_igb_interrupt_handler, (void *)eth_dev);
> > > > >
> > > > > @@ -754,7 +775,7 @@ eth_igb_start(struct rte_eth_dev *dev)
> > > > >  {
> > > > >  	struct e1000_hw *hw =
> > > > >  		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> > > > > -	int ret, i, mask;
> > > > > +	int ret, mask;
> > > > >  	uint32_t ctrl_ext;
> > > > >
> > > > >  	PMD_INIT_FUNC_TRACE();
> > > > > @@ -794,6 +815,9 @@ eth_igb_start(struct rte_eth_dev *dev)
> > > > >  	/* configure PF module if SRIOV enabled */
> > > > >  	igb_pf_host_configure(dev);
> > > > >
> > > > > +	/* confiugre msix for rx interrupt */
> > > > > +	eth_igb_configure_msix_intr(hw);
> > > > > +
> > > > >  	/* Configure for OS presence */
> > > > >  	igb_init_manageability(hw);
> > > > >
> > > > > @@ -821,33 +845,9 @@ eth_igb_start(struct rte_eth_dev *dev)
> > > > >  		igb_vmdq_vlan_hw_filter_enable(dev);
> > > > >  	}
> > > > >
> > > > > -	/*
> > > > > -	 * Configure the Interrupt Moderation register (EITR) with the maximum
> > > > > -	 * possible value (0xFFFF) to minimize "System Partial Write" issued by
> > > > > -	 * spurious [DMA] memory updates of RX and TX ring descriptors.
> > > > > -	 *
> > > > > -	 * With a EITR granularity of 2 microseconds in the 82576, only 7/8
> > > > > -	 * spurious memory updates per second should be expected.
> > > > > -	 * ((65535 * 2) / 1000.1000 ~= 0.131 second).
> > > > > -	 *
> > > > > -	 * Because interrupts are not used at all, the MSI-X is not activated
> > > > > -	 * and interrupt moderation is controlled by EITR[0].
> > > > > -	 *
> > > > > -	 * Note that having [almost] disabled memory updates of RX and TX ring
> > > > > -	 * descriptors through the Interrupt Moderation mechanism, memory
> > > > > -	 * updates of ring descriptors are now moderated by the configurable
> > > > > -	 * value of Write-Back Threshold registers.
> > > > > -	 */
> > > > >  	if ((hw->mac.type == e1000_82576) || (hw->mac.type == e1000_82580)
> > > > > ||
> > > > >  		(hw->mac.type == e1000_i350) || (hw->mac.type == e1000_i210)
> > > > > ||
> > > > >  		(hw->mac.type == e1000_i211)) {
> > > > > -		uint32_t ivar;
> > > > > -
> > > > > -		/* Enable all RX & TX queues in the IVAR registers */
> > > > > -		ivar = (uint32_t) ((E1000_IVAR_VALID << 16) |
> > > > > E1000_IVAR_VALID);
> > > > > -		for (i = 0; i < 8; i++)
> > > > > -			E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, i, ivar);
> > > > > -
> > > > >  		/* Configure EITR with the maximum possible value (0xFFFF) */
> > > > >  		E1000_WRITE_REG(hw, E1000_EITR(0), 0xFFFF);
> > > > >  	}
> > > > > @@ -901,6 +901,10 @@ eth_igb_start(struct rte_eth_dev *dev)
> > > > >  	if (dev->data->dev_conf.intr_conf.lsc != 0)
> > > > >  		ret = eth_igb_lsc_interrupt_setup(dev);
> > > > >
> > > > > +	/* check if rxq interrupt is enabled */
> > > > > +	if (dev->data->dev_conf.intr_conf.rxq != 0)
> > > > > +		eth_igb_rxq_interrupt_setup(dev);
> > > > > +
> > > > >  	/* resume enabled intr since hw reset */
> > > > >  	igb_intr_enable(dev);
> > > > >
> > > > > @@ -1791,6 +1795,35 @@ eth_igb_lsc_interrupt_setup(struct rte_eth_dev *dev)
> > > > >  		E1000_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
> > > > >
> > > > >  	intr->mask |= E1000_ICR_LSC;
> > > > > +	rte_spinlock_init(&(intr->lock));
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * It clears the interrupt causes and enables the interrupt.
> > > > > + * It will be called once only during nic initialized.
> > > > > + *
> > > > > + * @param dev
> > > > > + *  Pointer to struct rte_eth_dev.
> > > > > + *
> > > > > + * @return
> > > > > + *  - On success, zero.
> > > > > + *  - On failure, a negative value.
> > > > > + */
> > > > > +static int eth_igb_rxq_interrupt_setup(struct rte_eth_dev *dev)
> > > > > +{
> > > > > +	uint32_t mask, regval;
> > > > > +	struct e1000_hw *hw =
> > > > > +		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> > > > > +	struct rte_eth_dev_info dev_info;
> > > > > +
> > > > > +	memset(&dev_info, 0, sizeof(dev_info));
> > > > > +	eth_igb_infos_get(dev, &dev_info);
> > > > > +
> > > > > +	mask = 0xFFFFFFFF >> (32 - dev_info.max_rx_queues);
> > > > > +	regval = E1000_READ_REG(hw, E1000_EIMS);
> > > > > +	E1000_WRITE_REG(hw, E1000_EIMS, regval | mask);
> > > > >
> > > > >  	return 0;
> > > > >  }
> > > > > @@ -3256,5 +3289,152 @@ static struct rte_driver pmd_igbvf_drv = {
> > > > >  	.init = rte_igbvf_pmd_init,
> > > > >  };
> > > > >
> > > > > +static int
> > > > > +eth_igb_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
> > > > > +{
> > > > > +	struct e1000_hw *hw =
> > > > > +		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> > > > > +	struct e1000_interrupt *intr =
> > > > > +		E1000_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
> > > > > +	uint32_t mask = 1 << queue_id;
> > > > > +
> > > > > +	rte_spinlock_lock(&(intr->lock));
> > > > > +	E1000_WRITE_REG(hw, E1000_EIMC, mask);
> > > > > +	E1000_WRITE_FLUSH(hw);
> > > > > +	rte_spinlock_unlock(&(intr->lock));
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +static int
> > > > > +eth_igb_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
> > > > > +{
> > > > > +	struct e1000_hw *hw =
> > > > > +		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> > > > > +	struct e1000_interrupt *intr =
> > > > > +		E1000_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
> > > > > +	uint32_t mask = 1 << queue_id;
> > > > > +	uint32_t regval;
> > > > > +
> > > > > +	rte_spinlock_lock(&(intr->lock));
> > > > > +	regval = E1000_READ_REG(hw, E1000_EIMS);
> > > > > +	E1000_WRITE_REG(hw, E1000_EIMS, regval | mask);
> > > > > +	E1000_WRITE_FLUSH(hw);
> > > > > +	rte_spinlock_unlock(&(intr->lock));
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +static void
> > > > > +eth_igb_write_ivar(struct e1000_hw *hw, uint8_t  msix_vector,
> > > > > +			uint8_t index, uint8_t offset)
> > > > > +{
> > > > > +	uint32_t val = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index);
> > > > > +
> > > > > +	/* clear bits */
> > > > > +	val &= ~((uint32_t)0xFF << offset);
> > > > > +
> > > > > +	/* write vector and valid bit */
> > > > > +	val |= (msix_vector | E1000_IVAR_VALID) << offset;
> > > > > +
> > > > > +	E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, val);
> > > > > +}
> > > > > +
> > > > > +static void
> > > > > +eth_igb_assign_msix_vector(struct e1000_hw *hw, int8_t direction,
> > > > > +				 uint8_t queue, uint8_t msix_vector)
> > > > > +{
> > > > > +	uint32_t tmp = 0;
> > > > > +	if (hw->mac.type == e1000_82575) {
> > > > > +		if (direction == 0)
> > > > > +			tmp = E1000_EICR_RX_QUEUE0 << queue;
> > > > > +		else if (direction == 1)
> > > > > +			tmp = E1000_EICR_TX_QUEUE0 << queue;
> > > > > +		E1000_WRITE_REG(hw, E1000_MSIXBM(msix_vector), tmp);
> > > > > +	} else if (hw->mac.type == e1000_82576) {
> > > > > +		if ((direction == 0) || (direction == 1))
> > > > > +			eth_igb_write_ivar(hw, msix_vector, queue & 0x7,
> > > > > +					((queue & 0x8) << 1) + 8 * direction);
> > > > > +	} else if ((hw->mac.type == e1000_82580) ||
> > > > > +			(hw->mac.type == e1000_i350) ||
> > > > > +			(hw->mac.type == e1000_i354) ||
> > > > > +			(hw->mac.type == e1000_i210) ||
> > > > > +			(hw->mac.type == e1000_i211)) {
> > > > > +		if ((direction == 0) || (direction == 1))
> > > > > +			eth_igb_write_ivar(hw, msix_vector,
> > > > > +					queue >> 1,
> > > > > +					((queue & 0x1) << 4) + 8 * direction);
> > > > > +	}
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * Sets up the hardware to generate MSI-X interrupts properly
> > > > > + * @hw
> > > > > + *  board private structure
> > > > > + */
> > > > > +static void
> > > > > +eth_igb_configure_msix_intr(struct e1000_hw *hw)
> > > > > +{
> > > > > +	int queue_id;
> > > > > +	uint32_t tmpval, regval, intr_mask;
> > > > > +	uint32_t max_rx_queues = hw->mac.max_rx_queues;
> > > > > +
> > > > > +	/* set interrupt vector for other causes */
> > > > > +	if (hw->mac.type == e1000_82575) {
> > > > > +		tmpval = E1000_READ_REG(hw, E1000_CTRL_EXT);
> > > > > +		/* enable MSI-X PBA support */
> > > > > +		tmpval |= E1000_CTRL_EXT_PBA_CLR;
> > > > > +
> > > > > +		/* Auto-Mask interrupts upon ICR read */
> > > > > +		tmpval |= E1000_CTRL_EXT_EIAME;
> > > > > +		tmpval |= E1000_CTRL_EXT_IRCA;
> > > > > +
> > > > > +		E1000_WRITE_REG(hw, E1000_CTRL_EXT, tmpval);
> > > > > +
> > > > > +		/* enable msix_other interrupt */
> > > > > +		E1000_WRITE_REG_ARRAY(hw, E1000_MSIXBM(0), 0,
> > > > > E1000_EIMS_OTHER);
> > > > > +		regval = E1000_READ_REG(hw, E1000_EIAC);
> > > > > +		E1000_WRITE_REG(hw, E1000_EIAC, regval |
> > > > > E1000_EIMS_OTHER);
> > > > > +		regval = E1000_READ_REG(hw, E1000_EIAM);
> > > > > +		E1000_WRITE_REG(hw, E1000_EIMS, regval |
> > > > > E1000_EIMS_OTHER);
> > > > > +	} else if ((hw->mac.type == e1000_82576) ||
> > > > > +			(hw->mac.type == e1000_82580) ||
> > > > > +			(hw->mac.type == e1000_i350) ||
> > > > > +			(hw->mac.type == e1000_i354) ||
> > > > > +			(hw->mac.type == e1000_i210) ||
> > > > > +			(hw->mac.type == e1000_i211)) {
> > > > > +		/* turn on MSI-X capability first */
> > > > > +		E1000_WRITE_REG(hw, E1000_GPIE, E1000_GPIE_MSIX_MODE |
> > > > > +							E1000_GPIE_PBA |
> > > > > E1000_GPIE_EIAME |
> > > > > +							E1000_GPIE_NSICR);
> > > > > +
> > > > > +		/* enable msix_other interrupt */
> > > > > +		intr_mask = 1 << max_rx_queues;
> > > > > +		regval = E1000_READ_REG(hw, E1000_EIAC);
> > > > > +		E1000_WRITE_REG(hw, E1000_EIAC, regval | intr_mask);
> > > > > +		regval = E1000_READ_REG(hw, E1000_EIMS);
> > > > > +		E1000_WRITE_REG(hw, E1000_EIMS, regval | intr_mask);
> > > > > +		tmpval = (max_rx_queues | E1000_IVAR_VALID) << 8;
> > > > > +
> > > > > +		E1000_WRITE_REG(hw, E1000_IVAR_MISC, tmpval);
> > > > > +	}
> > > > > +
> > > > > +	/*
> > > > > +	* use EIAM and EIAC to auto-mask and auto-clear when MSI-X interrupt
> > > > > is asserted
> > > > > +	* this saves a register write for every interrupt
> > > > > +	*/
> > > > > +	intr_mask = 0xFFFFFFFF >> (32 - max_rx_queues);
> > > > > +	regval = E1000_READ_REG(hw, E1000_EIAC);
> > > > > +	E1000_WRITE_REG(hw, E1000_EIAC, regval | intr_mask);
> > > > > +	regval = E1000_READ_REG(hw, E1000_EIAM);
> > > > > +	E1000_WRITE_REG(hw, E1000_EIAM, regval | intr_mask);
> > > > > +
> > > > > +	for (queue_id = 0; queue_id < VFIO_MAX_QUEUE_ID; queue_id++)
> > > > > +		eth_igb_assign_msix_vector(hw, 0, queue_id, queue_id);
> > > > > +
> > > > > +	E1000_WRITE_FLUSH(hw);
> > > > > +}
> > > > > +
> > > > > +
> > > > >  PMD_REGISTER_DRIVER(pmd_igb_drv);
> > > > >  PMD_REGISTER_DRIVER(pmd_igbvf_drv);
> > > > > --
> > > > > 1.8.1.4
  

Patch

diff --git a/lib/librte_pmd_e1000/e1000/e1000_hw.h b/lib/librte_pmd_e1000/e1000/e1000_hw.h
index 4dd92a3..9b999ec 100644
--- a/lib/librte_pmd_e1000/e1000/e1000_hw.h
+++ b/lib/librte_pmd_e1000/e1000/e1000_hw.h
@@ -780,6 +780,9 @@  struct e1000_mac_info {
 	u16 mta_reg_count;
 	u16 uta_reg_count;
 
+	u32 max_rx_queues;
+	u32 max_tx_queues;
+
 	/* Maximum size of the MTA register table in all supported adapters */
 	#define MAX_MTA_REG 128
 	u32 mta_shadow[MAX_MTA_REG];
diff --git a/lib/librte_pmd_e1000/e1000_ethdev.h b/lib/librte_pmd_e1000/e1000_ethdev.h
index d155e77..713ca11 100644
--- a/lib/librte_pmd_e1000/e1000_ethdev.h
+++ b/lib/librte_pmd_e1000/e1000_ethdev.h
@@ -34,6 +34,8 @@ 
 #ifndef _E1000_ETHDEV_H_
 #define _E1000_ETHDEV_H_
 
+#include <rte_spinlock.h>
+
 /* need update link, bit flag */
 #define E1000_FLAG_NEED_LINK_UPDATE (uint32_t)(1 << 0)
 #define E1000_FLAG_MAILBOX          (uint32_t)(1 << 1)
@@ -105,10 +107,14 @@ 
 #define E1000_FTQF_QUEUE_SHIFT           16
 #define E1000_FTQF_QUEUE_ENABLE          0x00000100
 
+/* maximum number of other interrupts besides Rx & Tx interrupts */
+#define E1000_MAX_OTHER_INTR		1
+
 /* structure for interrupt relative data */
 struct e1000_interrupt {
 	uint32_t flags;
 	uint32_t mask;
+	rte_spinlock_t lock;
 };
 
 /* local vfta copy */
diff --git a/lib/librte_pmd_e1000/igb_ethdev.c b/lib/librte_pmd_e1000/igb_ethdev.c
index 2a268b8..7d9b103 100644
--- a/lib/librte_pmd_e1000/igb_ethdev.c
+++ b/lib/librte_pmd_e1000/igb_ethdev.c
@@ -97,6 +97,7 @@  static int  eth_igb_flow_ctrl_get(struct rte_eth_dev *dev,
 static int  eth_igb_flow_ctrl_set(struct rte_eth_dev *dev,
 				struct rte_eth_fc_conf *fc_conf);
 static int eth_igb_lsc_interrupt_setup(struct rte_eth_dev *dev);
+static int eth_igb_rxq_interrupt_setup(struct rte_eth_dev *dev);
 static int eth_igb_interrupt_get_status(struct rte_eth_dev *dev);
 static int eth_igb_interrupt_action(struct rte_eth_dev *dev);
 static void eth_igb_interrupt_handler(struct rte_intr_handle *handle,
@@ -191,6 +192,14 @@  static int eth_igb_filter_ctrl(struct rte_eth_dev *dev,
 		     enum rte_filter_op filter_op,
 		     void *arg);
 
+static int eth_igb_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id);
+static int eth_igb_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id);
+static void eth_igb_assign_msix_vector(struct e1000_hw *hw, int8_t direction,
+				uint8_t queue, uint8_t msix_vector);
+static void eth_igb_configure_msix_intr(struct  e1000_hw *hw);
+static void eth_igb_write_ivar(struct e1000_hw *hw, uint8_t msix_vector,
+				uint8_t index, uint8_t offset);
+
 /*
  * Define VF Stats MACRO for Non "cleared on read" register
  */
@@ -250,6 +259,8 @@  static struct eth_dev_ops eth_igb_ops = {
 	.vlan_tpid_set        = eth_igb_vlan_tpid_set,
 	.vlan_offload_set     = eth_igb_vlan_offload_set,
 	.rx_queue_setup       = eth_igb_rx_queue_setup,
+	.rx_queue_intr_enable = eth_igb_rx_queue_intr_enable,
+	.rx_queue_intr_disable = eth_igb_rx_queue_intr_disable,
 	.rx_queue_release     = eth_igb_rx_queue_release,
 	.rx_queue_count       = eth_igb_rx_queue_count,
 	.rx_descriptor_done   = eth_igb_rx_descriptor_done,
@@ -592,6 +603,16 @@  eth_igb_dev_init(__attribute__((unused)) struct eth_driver *eth_drv,
 		     eth_dev->data->port_id, pci_dev->id.vendor_id,
 		     pci_dev->id.device_id);
 
+	/* set max interrupt vfio request */
+	struct rte_eth_dev_info dev_info;
+
+	memset(&dev_info, 0, sizeof(dev_info));
+	eth_igb_infos_get(eth_dev, &dev_info);
+
+	hw->mac.max_rx_queues = dev_info.max_rx_queues;
+
+	pci_dev->intr_handle.max_intr = hw->mac.max_rx_queues + E1000_MAX_OTHER_INTR;
+
 	rte_intr_callback_register(&(pci_dev->intr_handle),
 		eth_igb_interrupt_handler, (void *)eth_dev);
 
@@ -754,7 +775,7 @@  eth_igb_start(struct rte_eth_dev *dev)
 {
 	struct e1000_hw *hw =
 		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
-	int ret, i, mask;
+	int ret, mask;
 	uint32_t ctrl_ext;
 
 	PMD_INIT_FUNC_TRACE();
@@ -794,6 +815,9 @@  eth_igb_start(struct rte_eth_dev *dev)
 	/* configure PF module if SRIOV enabled */
 	igb_pf_host_configure(dev);
 
+	/* confiugre msix for rx interrupt */
+	eth_igb_configure_msix_intr(hw);
+
 	/* Configure for OS presence */
 	igb_init_manageability(hw);
 
@@ -821,33 +845,9 @@  eth_igb_start(struct rte_eth_dev *dev)
 		igb_vmdq_vlan_hw_filter_enable(dev);
 	}
 
-	/*
-	 * Configure the Interrupt Moderation register (EITR) with the maximum
-	 * possible value (0xFFFF) to minimize "System Partial Write" issued by
-	 * spurious [DMA] memory updates of RX and TX ring descriptors.
-	 *
-	 * With a EITR granularity of 2 microseconds in the 82576, only 7/8
-	 * spurious memory updates per second should be expected.
-	 * ((65535 * 2) / 1000.1000 ~= 0.131 second).
-	 *
-	 * Because interrupts are not used at all, the MSI-X is not activated
-	 * and interrupt moderation is controlled by EITR[0].
-	 *
-	 * Note that having [almost] disabled memory updates of RX and TX ring
-	 * descriptors through the Interrupt Moderation mechanism, memory
-	 * updates of ring descriptors are now moderated by the configurable
-	 * value of Write-Back Threshold registers.
-	 */
 	if ((hw->mac.type == e1000_82576) || (hw->mac.type == e1000_82580) ||
 		(hw->mac.type == e1000_i350) || (hw->mac.type == e1000_i210) ||
 		(hw->mac.type == e1000_i211)) {
-		uint32_t ivar;
-
-		/* Enable all RX & TX queues in the IVAR registers */
-		ivar = (uint32_t) ((E1000_IVAR_VALID << 16) | E1000_IVAR_VALID);
-		for (i = 0; i < 8; i++)
-			E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, i, ivar);
-
 		/* Configure EITR with the maximum possible value (0xFFFF) */
 		E1000_WRITE_REG(hw, E1000_EITR(0), 0xFFFF);
 	}
@@ -901,6 +901,10 @@  eth_igb_start(struct rte_eth_dev *dev)
 	if (dev->data->dev_conf.intr_conf.lsc != 0)
 		ret = eth_igb_lsc_interrupt_setup(dev);
 
+	/* check if rxq interrupt is enabled */
+	if (dev->data->dev_conf.intr_conf.rxq != 0)
+		eth_igb_rxq_interrupt_setup(dev);
+
 	/* resume enabled intr since hw reset */
 	igb_intr_enable(dev);
 
@@ -1791,6 +1795,35 @@  eth_igb_lsc_interrupt_setup(struct rte_eth_dev *dev)
 		E1000_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
 
 	intr->mask |= E1000_ICR_LSC;
+	rte_spinlock_init(&(intr->lock));
+
+	return 0;
+}
+
+/*
+ * It clears the interrupt causes and enables the interrupt.
+ * It will be called once only during nic initialized.
+ *
+ * @param dev
+ *  Pointer to struct rte_eth_dev.
+ *
+ * @return
+ *  - On success, zero.
+ *  - On failure, a negative value.
+ */
+static int eth_igb_rxq_interrupt_setup(struct rte_eth_dev *dev)
+{
+	uint32_t mask, regval;
+	struct e1000_hw *hw =
+		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct rte_eth_dev_info dev_info;
+
+	memset(&dev_info, 0, sizeof(dev_info));
+	eth_igb_infos_get(dev, &dev_info);
+
+	mask = 0xFFFFFFFF >> (32 - dev_info.max_rx_queues);
+	regval = E1000_READ_REG(hw, E1000_EIMS);
+	E1000_WRITE_REG(hw, E1000_EIMS, regval | mask);
 
 	return 0;
 }
@@ -3256,5 +3289,152 @@  static struct rte_driver pmd_igbvf_drv = {
 	.init = rte_igbvf_pmd_init,
 };
 
+static int
+eth_igb_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	struct e1000_hw *hw =
+		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct e1000_interrupt *intr =
+		E1000_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+	uint32_t mask = 1 << queue_id;
+
+	rte_spinlock_lock(&(intr->lock));
+	E1000_WRITE_REG(hw, E1000_EIMC, mask);
+	E1000_WRITE_FLUSH(hw);
+	rte_spinlock_unlock(&(intr->lock));
+
+	return 0;
+}
+
+static int
+eth_igb_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	struct e1000_hw *hw =
+		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct e1000_interrupt *intr =
+		E1000_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+	uint32_t mask = 1 << queue_id;
+	uint32_t regval;
+
+	rte_spinlock_lock(&(intr->lock));
+	regval = E1000_READ_REG(hw, E1000_EIMS);
+	E1000_WRITE_REG(hw, E1000_EIMS, regval | mask);
+	E1000_WRITE_FLUSH(hw);
+	rte_spinlock_unlock(&(intr->lock));
+
+	return 0;
+}
+
+static void
+eth_igb_write_ivar(struct e1000_hw *hw, uint8_t  msix_vector,
+			uint8_t index, uint8_t offset)
+{
+	uint32_t val = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index);
+
+	/* clear bits */
+	val &= ~((uint32_t)0xFF << offset);
+
+	/* write vector and valid bit */
+	val |= (msix_vector | E1000_IVAR_VALID) << offset;
+
+	E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, val);
+}
+
+static void
+eth_igb_assign_msix_vector(struct e1000_hw *hw, int8_t direction,
+				 uint8_t queue, uint8_t msix_vector)
+{
+	uint32_t tmp = 0;
+	if (hw->mac.type == e1000_82575) {
+		if (direction == 0)
+			tmp = E1000_EICR_RX_QUEUE0 << queue;
+		else if (direction == 1)
+			tmp = E1000_EICR_TX_QUEUE0 << queue;
+		E1000_WRITE_REG(hw, E1000_MSIXBM(msix_vector), tmp);
+	} else if (hw->mac.type == e1000_82576) {
+		if ((direction == 0) || (direction == 1))
+			eth_igb_write_ivar(hw, msix_vector, queue & 0x7,
+					((queue & 0x8) << 1) + 8 * direction);
+	} else if ((hw->mac.type == e1000_82580) ||
+			(hw->mac.type == e1000_i350) ||
+			(hw->mac.type == e1000_i354) ||
+			(hw->mac.type == e1000_i210) ||
+			(hw->mac.type == e1000_i211)) {
+		if ((direction == 0) || (direction == 1))
+			eth_igb_write_ivar(hw, msix_vector,
+					queue >> 1,
+					((queue & 0x1) << 4) + 8 * direction);
+	}
+}
+
+/*
+ * Sets up the hardware to generate MSI-X interrupts properly
+ * @hw
+ *  board private structure
+ */
+static void
+eth_igb_configure_msix_intr(struct e1000_hw *hw)
+{
+	int queue_id;
+	uint32_t tmpval, regval, intr_mask;
+	uint32_t max_rx_queues = hw->mac.max_rx_queues;
+
+	/* set interrupt vector for other causes */
+	if (hw->mac.type == e1000_82575) {
+		tmpval = E1000_READ_REG(hw, E1000_CTRL_EXT);
+		/* enable MSI-X PBA support */
+		tmpval |= E1000_CTRL_EXT_PBA_CLR;
+
+		/* Auto-Mask interrupts upon ICR read */
+		tmpval |= E1000_CTRL_EXT_EIAME;
+		tmpval |= E1000_CTRL_EXT_IRCA;
+
+		E1000_WRITE_REG(hw, E1000_CTRL_EXT, tmpval);
+
+		/* enable msix_other interrupt */
+		E1000_WRITE_REG_ARRAY(hw, E1000_MSIXBM(0), 0, E1000_EIMS_OTHER);
+		regval = E1000_READ_REG(hw, E1000_EIAC);
+		E1000_WRITE_REG(hw, E1000_EIAC, regval | E1000_EIMS_OTHER);
+		regval = E1000_READ_REG(hw, E1000_EIAM);
+		E1000_WRITE_REG(hw, E1000_EIMS, regval | E1000_EIMS_OTHER);
+	} else if ((hw->mac.type == e1000_82576) ||
+			(hw->mac.type == e1000_82580) ||
+			(hw->mac.type == e1000_i350) ||
+			(hw->mac.type == e1000_i354) ||
+			(hw->mac.type == e1000_i210) ||
+			(hw->mac.type == e1000_i211)) {
+		/* turn on MSI-X capability first */
+		E1000_WRITE_REG(hw, E1000_GPIE, E1000_GPIE_MSIX_MODE |
+							E1000_GPIE_PBA | E1000_GPIE_EIAME |
+							E1000_GPIE_NSICR);
+
+		/* enable msix_other interrupt */
+		intr_mask = 1 << max_rx_queues;
+		regval = E1000_READ_REG(hw, E1000_EIAC);
+		E1000_WRITE_REG(hw, E1000_EIAC, regval | intr_mask);
+		regval = E1000_READ_REG(hw, E1000_EIMS);
+		E1000_WRITE_REG(hw, E1000_EIMS, regval | intr_mask);
+		tmpval = (max_rx_queues | E1000_IVAR_VALID) << 8;
+
+		E1000_WRITE_REG(hw, E1000_IVAR_MISC, tmpval);
+	}
+
+	/*
+	* use EIAM and EIAC to auto-mask and auto-clear when MSI-X interrupt is asserted
+	* this saves a register write for every interrupt
+	*/
+	intr_mask = 0xFFFFFFFF >> (32 - max_rx_queues);
+	regval = E1000_READ_REG(hw, E1000_EIAC);
+	E1000_WRITE_REG(hw, E1000_EIAC, regval | intr_mask);
+	regval = E1000_READ_REG(hw, E1000_EIAM);
+	E1000_WRITE_REG(hw, E1000_EIAM, regval | intr_mask);
+
+	for (queue_id = 0; queue_id < VFIO_MAX_QUEUE_ID; queue_id++)
+		eth_igb_assign_msix_vector(hw, 0, queue_id, queue_id);
+
+	E1000_WRITE_FLUSH(hw);
+}
+
+
 PMD_REGISTER_DRIVER(pmd_igb_drv);
 PMD_REGISTER_DRIVER(pmd_igbvf_drv);