[dpdk-dev,4/6] i40e: add VMDQ support

Message ID 1411478047-1251-5-git-send-email-jing.d.chen@intel.com (mailing list archive)
State Superseded, archived
Headers

Commit Message

Chen, Jing D Sept. 23, 2014, 1:14 p.m. UTC
From: "Chen Jing D(Mark)" <jing.d.chen@intel.com>

The change includes several parts:
1. Get maximum number of VMDQ pools supported in dev_init.
2. Fill VMDQ info in i40e_dev_info_get.
3. Setup VMDQ pools in i40e_dev_configure.
4. i40e_vsi_setup change to support creation of VMDQ VSI.

Signed-off-by: Chen Jing D(Mark) <jing.d.chen@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
Acked-by: Jingjing Wu <jingjing.wu@intel.com>
Acked-by: Jijiang Liu <jijiang.liu@intel.com>
Acked-by: Huawei Xie <huawei.xie@intel.com>
---
 config/common_linuxapp            |    1 +
 lib/librte_pmd_i40e/i40e_ethdev.c |  237 ++++++++++++++++++++++++++++++++-----
 lib/librte_pmd_i40e/i40e_ethdev.h |   17 +++-
 3 files changed, 225 insertions(+), 30 deletions(-)
  

Comments

De Lara Guarch, Pablo Oct. 13, 2014, 4:14 p.m. UTC | #1
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Chen Jing D(Mark)
> Sent: Tuesday, September 23, 2014 2:14 PM
> To: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH 4/6] i40e: add VMDQ support
> 
> From: "Chen Jing D(Mark)" <jing.d.chen@intel.com>
> 
> The change includes several parts:
> 1. Get maximum number of VMDQ pools supported in dev_init.
> 2. Fill VMDQ info in i40e_dev_info_get.
> 3. Setup VMDQ pools in i40e_dev_configure.
> 4. i40e_vsi_setup change to support creation of VMDQ VSI.
> 
> Signed-off-by: Chen Jing D(Mark) <jing.d.chen@intel.com>
> Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Acked-by: Jingjing Wu <jingjing.wu@intel.com>
> Acked-by: Jijiang Liu <jijiang.liu@intel.com>
> Acked-by: Huawei Xie <huawei.xie@intel.com>
> ---
>  config/common_linuxapp            |    1 +
>  lib/librte_pmd_i40e/i40e_ethdev.c |  237
> ++++++++++++++++++++++++++++++++-----
>  lib/librte_pmd_i40e/i40e_ethdev.h |   17 +++-
>  3 files changed, 225 insertions(+), 30 deletions(-)
> 
> diff --git a/config/common_linuxapp b/config/common_linuxapp
> index 5bee910..d0bb3f7 100644
> --- a/config/common_linuxapp
> +++ b/config/common_linuxapp
> @@ -208,6 +208,7 @@
> CONFIG_RTE_LIBRTE_I40E_RX_ALLOW_BULK_ALLOC=y
>  CONFIG_RTE_LIBRTE_I40E_ALLOW_UNSUPPORTED_SFP=n
>  CONFIG_RTE_LIBRTE_I40E_16BYTE_RX_DESC=n
>  CONFIG_RTE_LIBRTE_I40E_QUEUE_NUM_PER_VF=4
> +CONFIG_RTE_LIBRTE_I40E_QUEUE_NUM_PER_VM=4

Should we include this option in config_bsdapp as well?

>  # interval up to 8160 us, aligned to 2 (or default value)
>  CONFIG_RTE_LIBRTE_I40E_ITR_INTERVAL=-1
> 
> diff --git a/lib/librte_pmd_i40e/i40e_ethdev.c
> b/lib/librte_pmd_i40e/i40e_ethdev.c
> index a00d6ca..a267c96 100644
> --- a/lib/librte_pmd_i40e/i40e_ethdev.c
> +++ b/lib/librte_pmd_i40e/i40e_ethdev.c
> @@ -168,6 +168,7 @@ static int i40e_get_cap(struct i40e_hw *hw);
>  static int i40e_pf_parameter_init(struct rte_eth_dev *dev);
>  static int i40e_pf_setup(struct i40e_pf *pf);
>  static int i40e_vsi_init(struct i40e_vsi *vsi);
> +static int i40e_vmdq_setup(struct rte_eth_dev *dev);
>  static void i40e_stat_update_32(struct i40e_hw *hw, uint32_t reg,
>  		bool offset_loaded, uint64_t *offset, uint64_t *stat);
>  static void i40e_stat_update_48(struct i40e_hw *hw,
> @@ -269,21 +270,11 @@ static struct eth_driver rte_i40e_pmd = {
>  };
> 
>  static inline int
> -i40e_prev_power_of_2(int n)
> +i40e_align_floor(int n)
>  {
> -       int p = n;
> -
> -       --p;
> -       p |= p >> 1;
> -       p |= p >> 2;
> -       p |= p >> 4;
> -       p |= p >> 8;
> -       p |= p >> 16;
> -       if (p == (n - 1))
> -               return n;
> -       p >>= 1;
> -
> -       return ++p;
> +	if (n == 0)
> +		return 0;
> +	return (1 << (sizeof(n) * CHAR_BIT - 1 - __builtin_clz(n)));
>  }
> 
>  static inline int
> @@ -500,7 +491,7 @@ eth_i40e_dev_init(__rte_unused struct eth_driver
> *eth_drv,
>  	if (!dev->data->mac_addrs) {
>  		PMD_INIT_LOG(ERR, "Failed to allocated memory "
>  					"for storing mac address");
> -		goto err_get_mac_addr;
> +		goto err_mac_alloc;
>  	}
>  	ether_addr_copy((struct ether_addr *)hw->mac.perm_addr,
>  					&dev->data->mac_addrs[0]);
> @@ -521,8 +512,9 @@ eth_i40e_dev_init(__rte_unused struct eth_driver
> *eth_drv,
> 
>  	return 0;
> 
> +err_mac_alloc:
> +	i40e_vsi_release(pf->main_vsi);
>  err_setup_pf_switch:
> -	rte_free(pf->main_vsi);
>  err_get_mac_addr:
>  err_configure_lan_hmc:
>  	(void)i40e_shutdown_lan_hmc(hw);
> @@ -541,6 +533,27 @@ err_get_capabilities:
>  static int
>  i40e_dev_configure(struct rte_eth_dev *dev)
>  {
> +	int ret;
> +	enum rte_eth_rx_mq_mode mq_mode = dev->data-
> >dev_conf.rxmode.mq_mode;
> +
> +	/* VMDQ setup.
> +	 *  Needs to move VMDQ setting out of i40e_pf_config_mq_rx() as
> VMDQ and
> +	 *  RSS setting have different requirements.
> +	 *  General PMD driver call sequence are NIC init, configure,
> +	 *  rx/tx_queue_setup and dev_start. In rx/tx_queue_setup()
> function, it
> +	 *  will try to lookup the VSI that specific queue belongs to if VMDQ
> +	 *  applicable. So, VMDQ setting has to be done before
> +	 *  rx/tx_queue_setup(). This function is good  to place vmdq_setup.
> +	 *  For RSS setting, it will try to calculate actual configured RX queue
> +	 *  number, which will be available after rx_queue_setup().
> dev_start()
> +	 *  function is good to place RSS setup.
> +	 */
> +	if (mq_mode & ETH_MQ_RX_VMDQ_FLAG) {
> +		ret = i40e_vmdq_setup(dev);
> +		if (ret)
> +			return ret;
> +	}
> +
>  	return i40e_dev_init_vlan(dev);
>  }
> 
> @@ -1389,6 +1402,16 @@ i40e_dev_info_get(struct rte_eth_dev *dev,
> struct rte_eth_dev_info *dev_info)
>  		DEV_TX_OFFLOAD_UDP_CKSUM |
>  		DEV_TX_OFFLOAD_TCP_CKSUM |
>  		DEV_TX_OFFLOAD_SCTP_CKSUM;
> +
> +	if (pf->flags | I40E_FLAG_VMDQ) {
> +		dev_info->max_vmdq_pools = pf->max_nb_vmdq_vsi;
> +		dev_info->vmdq_queue_base = dev_info->max_rx_queues;
> +		dev_info->vmdq_queue_num = pf->vmdq_nb_qps *
> +						pf->max_nb_vmdq_vsi;
> +		dev_info->vmdq_pool_base = I40E_VMDQ_POOL_BASE;
> +		dev_info->max_rx_queues += dev_info-
> >vmdq_queue_num;
> +		dev_info->max_tx_queues += dev_info-
> >vmdq_queue_num;
> +	}
>  }
> 
>  static int
> @@ -1814,7 +1837,7 @@ i40e_pf_parameter_init(struct rte_eth_dev *dev)
>  {
>  	struct i40e_pf *pf = I40E_DEV_PRIVATE_TO_PF(dev->data-
> >dev_private);
>  	struct i40e_hw *hw = I40E_PF_TO_HW(pf);
> -	uint16_t sum_queues = 0, sum_vsis;
> +	uint16_t sum_queues = 0, sum_vsis, left_queues;
> 
>  	/* First check if FW support SRIOV */
>  	if (dev->pci_dev->max_vfs && !hw->func_caps.sr_iov_1_1) {
> @@ -1830,7 +1853,7 @@ i40e_pf_parameter_init(struct rte_eth_dev *dev)
>  		pf->flags |= I40E_FLAG_RSS;
>  		pf->lan_nb_qps = RTE_MIN(hw->func_caps.num_tx_qp,
>  			(uint32_t)(1 << hw-
> >func_caps.rss_table_entry_width));
> -		pf->lan_nb_qps = i40e_prev_power_of_2(pf->lan_nb_qps);
> +		pf->lan_nb_qps = i40e_align_floor(pf->lan_nb_qps);
>  	} else
>  		pf->lan_nb_qps = 1;
>  	sum_queues = pf->lan_nb_qps;
> @@ -1864,11 +1887,19 @@ i40e_pf_parameter_init(struct rte_eth_dev
> *dev)
> 
>  	if (hw->func_caps.vmdq) {
>  		pf->flags |= I40E_FLAG_VMDQ;
> -		pf->vmdq_nb_qps = I40E_DEFAULT_QP_NUM_VMDQ;
> -		sum_queues += pf->vmdq_nb_qps;
> -		sum_vsis += 1;
> -		PMD_INIT_LOG(INFO, "VMDQ queue pairs:%u", pf-
> >vmdq_nb_qps);
> +		pf->vmdq_nb_qps =
> RTE_LIBRTE_I40E_QUEUE_NUM_PER_VM;
> +		pf->max_nb_vmdq_vsi = 1;
> +		/*
> +		 * If VMDQ available, assume a single VSI can be created.  Will
> adjust
> +		 * later.
> +		 */
> +		sum_queues += pf->vmdq_nb_qps * pf-
> >max_nb_vmdq_vsi;
> +		sum_vsis += pf->max_nb_vmdq_vsi;
> +	} else {
> +		pf->vmdq_nb_qps = 0;
> +		pf->max_nb_vmdq_vsi = 0;
>  	}
> +	pf->nb_cfg_vmdq_vsi = 0;
> 
>  	if (hw->func_caps.fd) {
>  		pf->flags |= I40E_FLAG_FDIR;
> @@ -1889,6 +1920,22 @@ i40e_pf_parameter_init(struct rte_eth_dev *dev)
>  		return -EINVAL;
>  	}
> 
> +	/* Adjust VMDQ setting to support as many VMs as possible */
> +	if (pf->flags & I40E_FLAG_VMDQ) {
> +		left_queues = hw->func_caps.num_rx_qp - sum_queues;
> +
> +		pf->max_nb_vmdq_vsi += RTE_MIN(left_queues / pf-
> >vmdq_nb_qps,
> +					pf->max_num_vsi - sum_vsis);
> +
> +		/* Limit the max VMDQ number that rte_ether that can
> support  */
> +		pf->max_nb_vmdq_vsi = RTE_MIN(pf->max_nb_vmdq_vsi,
> +					ETH_64_POOLS - 1);
> +
> +		PMD_INIT_LOG(INFO, "Max VMDQ VSI num:%u",
> +				pf->max_nb_vmdq_vsi);
> +		PMD_INIT_LOG(INFO, "VMDQ queue pairs:%u", pf-
> >vmdq_nb_qps);
> +	}
> +
>  	/* Each VSI occupy 1 MSIX interrupt at least, plus IRQ0 for misc intr
>  	 * cause */
>  	if (sum_vsis > hw->func_caps.num_msix_vectors - 1) {
> @@ -2281,7 +2328,7 @@ i40e_vsi_config_tc_queue_mapping(struct i40e_vsi
> *vsi,
>  	vsi->enabled_tc = enabled_tcmap;
> 
>  	/* Number of queues per enabled TC */
> -	qpnum_per_tc = i40e_prev_power_of_2(vsi->nb_qps / total_tc);
> +	qpnum_per_tc = i40e_align_floor(vsi->nb_qps / total_tc);
>  	qpnum_per_tc = RTE_MIN(qpnum_per_tc, I40E_MAX_Q_PER_TC);
>  	bsf = rte_bsf32(qpnum_per_tc);
> 
> @@ -2587,6 +2634,9 @@ i40e_vsi_setup(struct i40e_pf *pf,
>  	case I40E_VSI_SRIOV :
>  		vsi->nb_qps = pf->vf_nb_qps;
>  		break;
> +	case I40E_VSI_VMDQ2:
> +		vsi->nb_qps = pf->vmdq_nb_qps;
> +		break;
>  	default:
>  		goto fail_mem;
>  	}
> @@ -2728,8 +2778,44 @@ i40e_vsi_setup(struct i40e_pf *pf,
>  		 * Since VSI is not created yet, only configure parameter,
>  		 * will add vsi below.
>  		 */
> -	}
> -	else {
> +	} else if (type == I40E_VSI_VMDQ2) {
> +		memset(&ctxt, 0, sizeof(ctxt));
> +		/*
> +		 * For other VSI, the uplink_seid equals to uplink VSI's
> +		 * uplink_seid since they share same VEB
> +		 */
> +		vsi->uplink_seid = uplink_vsi->uplink_seid;
> +		ctxt.pf_num = hw->pf_id;
> +		ctxt.vf_num = 0;
> +		ctxt.uplink_seid = vsi->uplink_seid;
> +		ctxt.connection_type = 0x1;
> +		ctxt.flags = I40E_AQ_VSI_TYPE_VMDQ2;
> +
> +		ctxt.info.valid_sections |=
> +
> 	rte_cpu_to_le_16(I40E_AQ_VSI_PROP_SWITCH_VALID);
> +		/* user_param carries flag to enable loop back */
> +		if (user_param) {
> +			ctxt.info.switch_id =
> +
> 	rte_cpu_to_le_16(I40E_AQ_VSI_SW_ID_FLAG_LOCAL_LB);
> +			ctxt.info.switch_id |=
> +
> 	rte_cpu_to_le_16(I40E_AQ_VSI_SW_ID_FLAG_ALLOW_LB);
> +		}
> +
> +		/* Configure port/vlan */
> +		ctxt.info.valid_sections |=
> +
> 	rte_cpu_to_le_16(I40E_AQ_VSI_PROP_VLAN_VALID);
> +		ctxt.info.port_vlan_flags |=
> I40E_AQ_VSI_PVLAN_MODE_ALL;
> +		ret = i40e_vsi_config_tc_queue_mapping(vsi, &ctxt.info,
> +						I40E_DEFAULT_TCMAP);
> +		if (ret != I40E_SUCCESS) {
> +			PMD_DRV_LOG(ERR, "Failed to configure "
> +					"TC queue mapping\n");
> +			goto fail_msix_alloc;
> +		}
> +		ctxt.info.up_enable_bits = I40E_DEFAULT_TCMAP;
> +		ctxt.info.valid_sections |=
> +
> 	rte_cpu_to_le_16(I40E_AQ_VSI_PROP_SCHED_VALID);
> +	} else {
>  		PMD_DRV_LOG(ERR, "VSI: Not support other type VSI yet");
>  		goto fail_msix_alloc;
>  	}
> @@ -2901,7 +2987,6 @@ i40e_pf_setup(struct i40e_pf *pf)
>  {
>  	struct i40e_hw *hw = I40E_PF_TO_HW(pf);
>  	struct i40e_filter_control_settings settings;
> -	struct rte_eth_dev_data *dev_data = pf->dev_data;
>  	struct i40e_vsi *vsi;
>  	int ret;
> 
> @@ -2923,8 +3008,6 @@ i40e_pf_setup(struct i40e_pf *pf)
>  		return I40E_ERR_NOT_READY;
>  	}
>  	pf->main_vsi = vsi;
> -	dev_data->nb_rx_queues = vsi->nb_qps;
> -	dev_data->nb_tx_queues = vsi->nb_qps;
> 
>  	/* Configure filter control */
>  	memset(&settings, 0, sizeof(settings));
> @@ -3195,6 +3278,102 @@ i40e_vsi_init(struct i40e_vsi *vsi)
>  	return err;
>  }
> 
> +static int
> +i40e_vmdq_setup(struct rte_eth_dev *dev)
> +{
> +	struct rte_eth_conf *conf = &dev->data->dev_conf;
> +	struct i40e_pf *pf = I40E_DEV_PRIVATE_TO_PF(dev->data-
> >dev_private);
> +	int i, err, conf_vsis, j, loop;
> +	struct i40e_vsi *vsi;
> +	struct i40e_vmdq_info *vmdq_info;
> +	struct rte_eth_vmdq_rx_conf *vmdq_conf;
> +	struct i40e_hw *hw = I40E_PF_TO_HW(pf);
> +
> +	/*
> +	 * Disable interrupt to avoid message from VF. Furthermore, it will
> +	 * avoid race condition in VSI creation/destroy.
> +	 */
> +	i40e_pf_disable_irq0(hw);
> +
> +	if ((pf->flags & I40E_FLAG_VMDQ) == 0) {
> +		PMD_INIT_LOG(ERR, "FW doesn't support VMDQ");
> +		return -ENOTSUP;
> +	}
> +
> +	conf_vsis = conf->rx_adv_conf.vmdq_rx_conf.nb_queue_pools;
> +	if (conf_vsis > pf->max_nb_vmdq_vsi) {
> +		PMD_INIT_LOG(ERR, "VMDQ config: %u, max support:%u",
> +			conf->rx_adv_conf.vmdq_rx_conf.nb_queue_pools,
> +			pf->max_nb_vmdq_vsi);
> +		return -ENOTSUP;
> +	}
> +
> +	if (pf->vmdq != NULL) {
> +		PMD_INIT_LOG(INFO, "VMDQ already configured");
> +		return 0;
> +	}
> +
> +	pf->vmdq = rte_zmalloc("vmdq_info_struct",
> +				sizeof(*vmdq_info) * conf_vsis, 0);
> +
> +	if (pf->vmdq == NULL) {
> +		PMD_INIT_LOG(ERR, "Failed to allocate memory");
> +		return -ENOMEM;
> +	}
> +
> +	vmdq_conf = &conf->rx_adv_conf.vmdq_rx_conf;
> +
> +	/* Create VMDQ VSI */
> +	for (i = 0; i < conf_vsis; i++) {
> +		vsi = i40e_vsi_setup(pf, I40E_VSI_VMDQ2, pf->main_vsi,
> +				vmdq_conf->enable_loop_back);
> +		if (vsi == NULL) {
> +			PMD_INIT_LOG(ERR, "Failed to create VMDQ VSI");
> +			err = -1;
> +			goto err_vsi_setup;
> +		}
> +		vmdq_info = &pf->vmdq[i];
> +		vmdq_info->pf = pf;
> +		vmdq_info->vsi = vsi;
> +	}
> +	pf->nb_cfg_vmdq_vsi = conf_vsis;
> +
> +	/* Configure Vlan */
> +	loop = sizeof(vmdq_conf->pool_map[0].pools) * CHAR_BIT;
> +	for (i = 0; i < vmdq_conf->nb_pool_maps; i++) {
> +		for (j = 0; j < loop && j < pf->nb_cfg_vmdq_vsi; j++) {
> +			if (vmdq_conf->pool_map[i].pools & (1UL << j)) {
> +				PMD_INIT_LOG(INFO, "Add vlan %u to vmdq
> pool %u",
> +					vmdq_conf->pool_map[i].vlan_id, j);
> +
> +				err = i40e_vsi_add_vlan(pf->vmdq[j].vsi,
> +						vmdq_conf-
> >pool_map[i].vlan_id);
> +				if (err) {
> +					PMD_INIT_LOG(ERR, "Failed to add
> vlan");
> +					err = -1;
> +					goto err_vsi_setup;
> +				}
> +			}
> +		}
> +	}
> +
> +	i40e_pf_enable_irq0(hw);
> +
> +	return 0;
> +
> +err_vsi_setup:
> +	for (i = 0; i < conf_vsis; i++)
> +		if (pf->vmdq[i].vsi == NULL)
> +			break;
> +		else
> +			i40e_vsi_release(pf->vmdq[i].vsi);
> +
> +	rte_free(pf->vmdq);
> +	pf->vmdq = NULL;
> +	i40e_pf_enable_irq0(hw);
> +	return err;
> +}
> +
>  static void
>  i40e_stat_update_32(struct i40e_hw *hw,
>  		   uint32_t reg,
> @@ -4086,7 +4265,7 @@ i40e_pf_config_rss(struct i40e_pf *pf)
>  	struct i40e_hw *hw = I40E_PF_TO_HW(pf);
>  	struct rte_eth_rss_conf rss_conf;
>  	uint32_t i, lut = 0;
> -	uint16_t j, num = i40e_prev_power_of_2(pf->dev_data-
> >nb_rx_queues);
> +	uint16_t j, num = i40e_align_floor(pf->dev_data->nb_rx_queues);
> 
>  	for (i = 0, j = 0; i < hw->func_caps.rss_table_size; i++, j++) {
>  		if (j == num)
> diff --git a/lib/librte_pmd_i40e/i40e_ethdev.h
> b/lib/librte_pmd_i40e/i40e_ethdev.h
> index 64deef2..b06de05 100644
> --- a/lib/librte_pmd_i40e/i40e_ethdev.h
> +++ b/lib/librte_pmd_i40e/i40e_ethdev.h
> @@ -45,13 +45,15 @@
>  #define I40E_QUEUE_BASE_ADDR_UNIT 128
>  /* number of VSIs and queue default setting */
>  #define I40E_MAX_QP_NUM_PER_VF    16
> -#define I40E_DEFAULT_QP_NUM_VMDQ  64
>  #define I40E_DEFAULT_QP_NUM_FDIR  64
>  #define I40E_UINT32_BIT_SIZE      (CHAR_BIT * sizeof(uint32_t))
>  #define I40E_VFTA_SIZE            (4096 / I40E_UINT32_BIT_SIZE)
>  /* Default TC traffic in case DCB is not enabled */
>  #define I40E_DEFAULT_TCMAP        0x1
> 
> +/* Always assign pool 0 to main VSI, VMDQ will start from 1 */
> +#define I40E_VMDQ_POOL_BASE       1
> +
>  /* i40e flags */
>  #define I40E_FLAG_RSS                   (1ULL << 0)
>  #define I40E_FLAG_DCB                   (1ULL << 1)
> @@ -189,6 +191,14 @@ struct i40e_pf_vf {
>  };
> 
>  /*
> + * Structure to store private data for VMDQ instance
> + */
> +struct i40e_vmdq_info {
> +	struct i40e_pf *pf;
> +	struct i40e_vsi *vsi;
> +};
> +
> +/*
>   * Structure to store private data specific for PF instance.
>   */
>  struct i40e_pf {
> @@ -216,6 +226,11 @@ struct i40e_pf {
>  	uint16_t vmdq_nb_qps; /* The number of queue pairs of VMDq */
>  	uint16_t vf_nb_qps; /* The number of queue pairs of VF */
>  	uint16_t fdir_nb_qps; /* The number of queue pairs of Flow Director
> */
> +
> +	/* VMDQ related info */
> +	uint16_t max_nb_vmdq_vsi; /* Max number of VMDQ VSIs
> supported */
> +	uint16_t nb_cfg_vmdq_vsi; /* number of VMDQ VSIs configured */
> +	struct i40e_vmdq_info *vmdq;
>  };
> 
>  enum pending_msg {
> --
> 1.7.7.6
  

Patch

diff --git a/config/common_linuxapp b/config/common_linuxapp
index 5bee910..d0bb3f7 100644
--- a/config/common_linuxapp
+++ b/config/common_linuxapp
@@ -208,6 +208,7 @@  CONFIG_RTE_LIBRTE_I40E_RX_ALLOW_BULK_ALLOC=y
 CONFIG_RTE_LIBRTE_I40E_ALLOW_UNSUPPORTED_SFP=n
 CONFIG_RTE_LIBRTE_I40E_16BYTE_RX_DESC=n
 CONFIG_RTE_LIBRTE_I40E_QUEUE_NUM_PER_VF=4
+CONFIG_RTE_LIBRTE_I40E_QUEUE_NUM_PER_VM=4
 # interval up to 8160 us, aligned to 2 (or default value)
 CONFIG_RTE_LIBRTE_I40E_ITR_INTERVAL=-1
 
diff --git a/lib/librte_pmd_i40e/i40e_ethdev.c b/lib/librte_pmd_i40e/i40e_ethdev.c
index a00d6ca..a267c96 100644
--- a/lib/librte_pmd_i40e/i40e_ethdev.c
+++ b/lib/librte_pmd_i40e/i40e_ethdev.c
@@ -168,6 +168,7 @@  static int i40e_get_cap(struct i40e_hw *hw);
 static int i40e_pf_parameter_init(struct rte_eth_dev *dev);
 static int i40e_pf_setup(struct i40e_pf *pf);
 static int i40e_vsi_init(struct i40e_vsi *vsi);
+static int i40e_vmdq_setup(struct rte_eth_dev *dev);
 static void i40e_stat_update_32(struct i40e_hw *hw, uint32_t reg,
 		bool offset_loaded, uint64_t *offset, uint64_t *stat);
 static void i40e_stat_update_48(struct i40e_hw *hw,
@@ -269,21 +270,11 @@  static struct eth_driver rte_i40e_pmd = {
 };
 
 static inline int
-i40e_prev_power_of_2(int n)
+i40e_align_floor(int n)
 {
-       int p = n;
-
-       --p;
-       p |= p >> 1;
-       p |= p >> 2;
-       p |= p >> 4;
-       p |= p >> 8;
-       p |= p >> 16;
-       if (p == (n - 1))
-               return n;
-       p >>= 1;
-
-       return ++p;
+	if (n == 0)
+		return 0;
+	return (1 << (sizeof(n) * CHAR_BIT - 1 - __builtin_clz(n)));
 }
 
 static inline int
@@ -500,7 +491,7 @@  eth_i40e_dev_init(__rte_unused struct eth_driver *eth_drv,
 	if (!dev->data->mac_addrs) {
 		PMD_INIT_LOG(ERR, "Failed to allocated memory "
 					"for storing mac address");
-		goto err_get_mac_addr;
+		goto err_mac_alloc;
 	}
 	ether_addr_copy((struct ether_addr *)hw->mac.perm_addr,
 					&dev->data->mac_addrs[0]);
@@ -521,8 +512,9 @@  eth_i40e_dev_init(__rte_unused struct eth_driver *eth_drv,
 
 	return 0;
 
+err_mac_alloc:
+	i40e_vsi_release(pf->main_vsi);
 err_setup_pf_switch:
-	rte_free(pf->main_vsi);
 err_get_mac_addr:
 err_configure_lan_hmc:
 	(void)i40e_shutdown_lan_hmc(hw);
@@ -541,6 +533,27 @@  err_get_capabilities:
 static int
 i40e_dev_configure(struct rte_eth_dev *dev)
 {
+	int ret;
+	enum rte_eth_rx_mq_mode mq_mode = dev->data->dev_conf.rxmode.mq_mode;
+
+	/* VMDQ setup.
+	 *  Needs to move VMDQ setting out of i40e_pf_config_mq_rx() as VMDQ and
+	 *  RSS setting have different requirements.
+	 *  General PMD driver call sequence are NIC init, configure,
+	 *  rx/tx_queue_setup and dev_start. In rx/tx_queue_setup() function, it
+	 *  will try to lookup the VSI that specific queue belongs to if VMDQ
+	 *  applicable. So, VMDQ setting has to be done before
+	 *  rx/tx_queue_setup(). This function is good  to place vmdq_setup.
+	 *  For RSS setting, it will try to calculate actual configured RX queue
+	 *  number, which will be available after rx_queue_setup(). dev_start()
+	 *  function is good to place RSS setup.
+	 */
+	if (mq_mode & ETH_MQ_RX_VMDQ_FLAG) {
+		ret = i40e_vmdq_setup(dev);
+		if (ret)
+			return ret;
+	}
+
 	return i40e_dev_init_vlan(dev);
 }
 
@@ -1389,6 +1402,16 @@  i40e_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 		DEV_TX_OFFLOAD_UDP_CKSUM |
 		DEV_TX_OFFLOAD_TCP_CKSUM |
 		DEV_TX_OFFLOAD_SCTP_CKSUM;
+
+	if (pf->flags | I40E_FLAG_VMDQ) {
+		dev_info->max_vmdq_pools = pf->max_nb_vmdq_vsi;
+		dev_info->vmdq_queue_base = dev_info->max_rx_queues;
+		dev_info->vmdq_queue_num = pf->vmdq_nb_qps *
+						pf->max_nb_vmdq_vsi;
+		dev_info->vmdq_pool_base = I40E_VMDQ_POOL_BASE;
+		dev_info->max_rx_queues += dev_info->vmdq_queue_num;
+		dev_info->max_tx_queues += dev_info->vmdq_queue_num;
+	}
 }
 
 static int
@@ -1814,7 +1837,7 @@  i40e_pf_parameter_init(struct rte_eth_dev *dev)
 {
 	struct i40e_pf *pf = I40E_DEV_PRIVATE_TO_PF(dev->data->dev_private);
 	struct i40e_hw *hw = I40E_PF_TO_HW(pf);
-	uint16_t sum_queues = 0, sum_vsis;
+	uint16_t sum_queues = 0, sum_vsis, left_queues;
 
 	/* First check if FW support SRIOV */
 	if (dev->pci_dev->max_vfs && !hw->func_caps.sr_iov_1_1) {
@@ -1830,7 +1853,7 @@  i40e_pf_parameter_init(struct rte_eth_dev *dev)
 		pf->flags |= I40E_FLAG_RSS;
 		pf->lan_nb_qps = RTE_MIN(hw->func_caps.num_tx_qp,
 			(uint32_t)(1 << hw->func_caps.rss_table_entry_width));
-		pf->lan_nb_qps = i40e_prev_power_of_2(pf->lan_nb_qps);
+		pf->lan_nb_qps = i40e_align_floor(pf->lan_nb_qps);
 	} else
 		pf->lan_nb_qps = 1;
 	sum_queues = pf->lan_nb_qps;
@@ -1864,11 +1887,19 @@  i40e_pf_parameter_init(struct rte_eth_dev *dev)
 
 	if (hw->func_caps.vmdq) {
 		pf->flags |= I40E_FLAG_VMDQ;
-		pf->vmdq_nb_qps = I40E_DEFAULT_QP_NUM_VMDQ;
-		sum_queues += pf->vmdq_nb_qps;
-		sum_vsis += 1;
-		PMD_INIT_LOG(INFO, "VMDQ queue pairs:%u", pf->vmdq_nb_qps);
+		pf->vmdq_nb_qps = RTE_LIBRTE_I40E_QUEUE_NUM_PER_VM;
+		pf->max_nb_vmdq_vsi = 1;
+		/*
+		 * If VMDQ available, assume a single VSI can be created.  Will adjust
+		 * later.
+		 */
+		sum_queues += pf->vmdq_nb_qps * pf->max_nb_vmdq_vsi;
+		sum_vsis += pf->max_nb_vmdq_vsi;
+	} else {
+		pf->vmdq_nb_qps = 0;
+		pf->max_nb_vmdq_vsi = 0;
 	}
+	pf->nb_cfg_vmdq_vsi = 0;
 
 	if (hw->func_caps.fd) {
 		pf->flags |= I40E_FLAG_FDIR;
@@ -1889,6 +1920,22 @@  i40e_pf_parameter_init(struct rte_eth_dev *dev)
 		return -EINVAL;
 	}
 
+	/* Adjust VMDQ setting to support as many VMs as possible */
+	if (pf->flags & I40E_FLAG_VMDQ) {
+		left_queues = hw->func_caps.num_rx_qp - sum_queues;
+
+		pf->max_nb_vmdq_vsi += RTE_MIN(left_queues / pf->vmdq_nb_qps,
+					pf->max_num_vsi - sum_vsis);
+
+		/* Limit the max VMDQ number that rte_ether that can support  */
+		pf->max_nb_vmdq_vsi = RTE_MIN(pf->max_nb_vmdq_vsi,
+					ETH_64_POOLS - 1);
+
+		PMD_INIT_LOG(INFO, "Max VMDQ VSI num:%u",
+				pf->max_nb_vmdq_vsi);
+		PMD_INIT_LOG(INFO, "VMDQ queue pairs:%u", pf->vmdq_nb_qps);
+	}
+
 	/* Each VSI occupy 1 MSIX interrupt at least, plus IRQ0 for misc intr
 	 * cause */
 	if (sum_vsis > hw->func_caps.num_msix_vectors - 1) {
@@ -2281,7 +2328,7 @@  i40e_vsi_config_tc_queue_mapping(struct i40e_vsi *vsi,
 	vsi->enabled_tc = enabled_tcmap;
 
 	/* Number of queues per enabled TC */
-	qpnum_per_tc = i40e_prev_power_of_2(vsi->nb_qps / total_tc);
+	qpnum_per_tc = i40e_align_floor(vsi->nb_qps / total_tc);
 	qpnum_per_tc = RTE_MIN(qpnum_per_tc, I40E_MAX_Q_PER_TC);
 	bsf = rte_bsf32(qpnum_per_tc);
 
@@ -2587,6 +2634,9 @@  i40e_vsi_setup(struct i40e_pf *pf,
 	case I40E_VSI_SRIOV :
 		vsi->nb_qps = pf->vf_nb_qps;
 		break;
+	case I40E_VSI_VMDQ2:
+		vsi->nb_qps = pf->vmdq_nb_qps;
+		break;
 	default:
 		goto fail_mem;
 	}
@@ -2728,8 +2778,44 @@  i40e_vsi_setup(struct i40e_pf *pf,
 		 * Since VSI is not created yet, only configure parameter,
 		 * will add vsi below.
 		 */
-	}
-	else {
+	} else if (type == I40E_VSI_VMDQ2) {
+		memset(&ctxt, 0, sizeof(ctxt));
+		/*
+		 * For other VSI, the uplink_seid equals to uplink VSI's
+		 * uplink_seid since they share same VEB
+		 */
+		vsi->uplink_seid = uplink_vsi->uplink_seid;
+		ctxt.pf_num = hw->pf_id;
+		ctxt.vf_num = 0;
+		ctxt.uplink_seid = vsi->uplink_seid;
+		ctxt.connection_type = 0x1;
+		ctxt.flags = I40E_AQ_VSI_TYPE_VMDQ2;
+
+		ctxt.info.valid_sections |=
+				rte_cpu_to_le_16(I40E_AQ_VSI_PROP_SWITCH_VALID);
+		/* user_param carries flag to enable loop back */
+		if (user_param) {
+			ctxt.info.switch_id =
+			rte_cpu_to_le_16(I40E_AQ_VSI_SW_ID_FLAG_LOCAL_LB);
+			ctxt.info.switch_id |=
+			rte_cpu_to_le_16(I40E_AQ_VSI_SW_ID_FLAG_ALLOW_LB);
+		}
+
+		/* Configure port/vlan */
+		ctxt.info.valid_sections |=
+			rte_cpu_to_le_16(I40E_AQ_VSI_PROP_VLAN_VALID);
+		ctxt.info.port_vlan_flags |= I40E_AQ_VSI_PVLAN_MODE_ALL;
+		ret = i40e_vsi_config_tc_queue_mapping(vsi, &ctxt.info,
+						I40E_DEFAULT_TCMAP);
+		if (ret != I40E_SUCCESS) {
+			PMD_DRV_LOG(ERR, "Failed to configure "
+					"TC queue mapping\n");
+			goto fail_msix_alloc;
+		}
+		ctxt.info.up_enable_bits = I40E_DEFAULT_TCMAP;
+		ctxt.info.valid_sections |=
+			rte_cpu_to_le_16(I40E_AQ_VSI_PROP_SCHED_VALID);
+	} else {
 		PMD_DRV_LOG(ERR, "VSI: Not support other type VSI yet");
 		goto fail_msix_alloc;
 	}
@@ -2901,7 +2987,6 @@  i40e_pf_setup(struct i40e_pf *pf)
 {
 	struct i40e_hw *hw = I40E_PF_TO_HW(pf);
 	struct i40e_filter_control_settings settings;
-	struct rte_eth_dev_data *dev_data = pf->dev_data;
 	struct i40e_vsi *vsi;
 	int ret;
 
@@ -2923,8 +3008,6 @@  i40e_pf_setup(struct i40e_pf *pf)
 		return I40E_ERR_NOT_READY;
 	}
 	pf->main_vsi = vsi;
-	dev_data->nb_rx_queues = vsi->nb_qps;
-	dev_data->nb_tx_queues = vsi->nb_qps;
 
 	/* Configure filter control */
 	memset(&settings, 0, sizeof(settings));
@@ -3195,6 +3278,102 @@  i40e_vsi_init(struct i40e_vsi *vsi)
 	return err;
 }
 
+static int
+i40e_vmdq_setup(struct rte_eth_dev *dev)
+{
+	struct rte_eth_conf *conf = &dev->data->dev_conf;
+	struct i40e_pf *pf = I40E_DEV_PRIVATE_TO_PF(dev->data->dev_private);
+	int i, err, conf_vsis, j, loop;
+	struct i40e_vsi *vsi;
+	struct i40e_vmdq_info *vmdq_info;
+	struct rte_eth_vmdq_rx_conf *vmdq_conf;
+	struct i40e_hw *hw = I40E_PF_TO_HW(pf);
+
+	/*
+	 * Disable interrupt to avoid message from VF. Furthermore, it will
+	 * avoid race condition in VSI creation/destroy.
+	 */
+	i40e_pf_disable_irq0(hw);
+
+	if ((pf->flags & I40E_FLAG_VMDQ) == 0) {
+		PMD_INIT_LOG(ERR, "FW doesn't support VMDQ");
+		return -ENOTSUP;
+	}
+
+	conf_vsis = conf->rx_adv_conf.vmdq_rx_conf.nb_queue_pools;
+	if (conf_vsis > pf->max_nb_vmdq_vsi) {
+		PMD_INIT_LOG(ERR, "VMDQ config: %u, max support:%u",
+			conf->rx_adv_conf.vmdq_rx_conf.nb_queue_pools,
+			pf->max_nb_vmdq_vsi);
+		return -ENOTSUP;
+	}
+
+	if (pf->vmdq != NULL) {
+		PMD_INIT_LOG(INFO, "VMDQ already configured");
+		return 0;
+	}
+
+	pf->vmdq = rte_zmalloc("vmdq_info_struct",
+				sizeof(*vmdq_info) * conf_vsis, 0);
+
+	if (pf->vmdq == NULL) {
+		PMD_INIT_LOG(ERR, "Failed to allocate memory");
+		return -ENOMEM;
+	}
+
+	vmdq_conf = &conf->rx_adv_conf.vmdq_rx_conf;
+
+	/* Create VMDQ VSI */
+	for (i = 0; i < conf_vsis; i++) {
+		vsi = i40e_vsi_setup(pf, I40E_VSI_VMDQ2, pf->main_vsi,
+				vmdq_conf->enable_loop_back);
+		if (vsi == NULL) {
+			PMD_INIT_LOG(ERR, "Failed to create VMDQ VSI");
+			err = -1;
+			goto err_vsi_setup;
+		}
+		vmdq_info = &pf->vmdq[i];
+		vmdq_info->pf = pf;
+		vmdq_info->vsi = vsi;
+	}
+	pf->nb_cfg_vmdq_vsi = conf_vsis;
+
+	/* Configure Vlan */
+	loop = sizeof(vmdq_conf->pool_map[0].pools) * CHAR_BIT;
+	for (i = 0; i < vmdq_conf->nb_pool_maps; i++) {
+		for (j = 0; j < loop && j < pf->nb_cfg_vmdq_vsi; j++) {
+			if (vmdq_conf->pool_map[i].pools & (1UL << j)) {
+				PMD_INIT_LOG(INFO, "Add vlan %u to vmdq pool %u",
+					vmdq_conf->pool_map[i].vlan_id, j);
+
+				err = i40e_vsi_add_vlan(pf->vmdq[j].vsi,
+						vmdq_conf->pool_map[i].vlan_id);
+				if (err) {
+					PMD_INIT_LOG(ERR, "Failed to add vlan");
+					err = -1;
+					goto err_vsi_setup;
+				}
+			}
+		}
+	}
+
+	i40e_pf_enable_irq0(hw);
+
+	return 0;
+
+err_vsi_setup:
+	for (i = 0; i < conf_vsis; i++)
+		if (pf->vmdq[i].vsi == NULL)
+			break;
+		else
+			i40e_vsi_release(pf->vmdq[i].vsi);
+
+	rte_free(pf->vmdq);
+	pf->vmdq = NULL;
+	i40e_pf_enable_irq0(hw);
+	return err;
+}
+
 static void
 i40e_stat_update_32(struct i40e_hw *hw,
 		   uint32_t reg,
@@ -4086,7 +4265,7 @@  i40e_pf_config_rss(struct i40e_pf *pf)
 	struct i40e_hw *hw = I40E_PF_TO_HW(pf);
 	struct rte_eth_rss_conf rss_conf;
 	uint32_t i, lut = 0;
-	uint16_t j, num = i40e_prev_power_of_2(pf->dev_data->nb_rx_queues);
+	uint16_t j, num = i40e_align_floor(pf->dev_data->nb_rx_queues);
 
 	for (i = 0, j = 0; i < hw->func_caps.rss_table_size; i++, j++) {
 		if (j == num)
diff --git a/lib/librte_pmd_i40e/i40e_ethdev.h b/lib/librte_pmd_i40e/i40e_ethdev.h
index 64deef2..b06de05 100644
--- a/lib/librte_pmd_i40e/i40e_ethdev.h
+++ b/lib/librte_pmd_i40e/i40e_ethdev.h
@@ -45,13 +45,15 @@ 
 #define I40E_QUEUE_BASE_ADDR_UNIT 128
 /* number of VSIs and queue default setting */
 #define I40E_MAX_QP_NUM_PER_VF    16
-#define I40E_DEFAULT_QP_NUM_VMDQ  64
 #define I40E_DEFAULT_QP_NUM_FDIR  64
 #define I40E_UINT32_BIT_SIZE      (CHAR_BIT * sizeof(uint32_t))
 #define I40E_VFTA_SIZE            (4096 / I40E_UINT32_BIT_SIZE)
 /* Default TC traffic in case DCB is not enabled */
 #define I40E_DEFAULT_TCMAP        0x1
 
+/* Always assign pool 0 to main VSI, VMDQ will start from 1 */
+#define I40E_VMDQ_POOL_BASE       1
+
 /* i40e flags */
 #define I40E_FLAG_RSS                   (1ULL << 0)
 #define I40E_FLAG_DCB                   (1ULL << 1)
@@ -189,6 +191,14 @@  struct i40e_pf_vf {
 };
 
 /*
+ * Structure to store private data for VMDQ instance
+ */
+struct i40e_vmdq_info {
+	struct i40e_pf *pf;
+	struct i40e_vsi *vsi;
+};
+
+/*
  * Structure to store private data specific for PF instance.
  */
 struct i40e_pf {
@@ -216,6 +226,11 @@  struct i40e_pf {
 	uint16_t vmdq_nb_qps; /* The number of queue pairs of VMDq */
 	uint16_t vf_nb_qps; /* The number of queue pairs of VF */
 	uint16_t fdir_nb_qps; /* The number of queue pairs of Flow Director */
+
+	/* VMDQ related info */
+	uint16_t max_nb_vmdq_vsi; /* Max number of VMDQ VSIs supported */
+	uint16_t nb_cfg_vmdq_vsi; /* number of VMDQ VSIs configured */
+	struct i40e_vmdq_info *vmdq;
 };
 
 enum pending_msg {