Dear Thomas,
Please check the V2 patchset:
<https://patchwork.dpdk.org/project/dpdk/patch/20211019104724.19416-1-aman.kumar@vvdntech.in/>
Following Jerin's suggestion, we've moved this build option into the
config/x86/x86_amd_epyc_linux_gcc cross-file.
In V2 this is no longer a separate compilation option; it is enabled by
default once DPDK is cross-built. Please let us know your comments.
*With Best Regards*
Aman Kumar
VVDN Technologies Pvt. Ltd.
*web:* www.vvdntech.com
On Wed, Oct 13, 2021 at 10:23 PM Thomas Monjalon <thomas@monjalon.net>
wrote:
> 23/08/2021 10:44, Aman Kumar:
> > add non temporal load and temporal store for mprq memcpy.
> > define mlx5_ntload_tstore in meson build configuration to
> > enable this optimization. This utilizes AMD EPYC2 optimized
> > rte_memcpy* routines.
> [...]
> > +option('mlx5_ntload_tstore', type: 'boolean', value: false, description:
> > + 'to enable optimized MPRQ in RX datapath')
>
> Please don't make it a compilation option.
> Why isn't it always enabled?
>
> There was a comment on the first patch.
> Do you plan to make a new version?
>
>
>
@@ -61,6 +61,7 @@ foreach option:cflags_options
cflags += option
endif
endforeach
+dpdk_conf.set('RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY', get_option('mlx5_ntload_tstore'))
if get_option('buildtype').contains('debug')
cflags += [ '-pedantic', '-DPEDANTIC' ]
else
@@ -161,6 +161,11 @@
/* Configure timeout of LRO session (in microseconds). */
#define MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec"
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+/* mprq_tstore_memcpy */
+#define MLX5_MPRQ_TSTORE_MEMCPY "mprq_tstore_memcpy"
+#endif
+
/*
* Device parameter to configure the total data buffer size for a single
* hairpin queue (logarithm value).
@@ -1991,6 +1996,10 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
config->decap_en = !!tmp;
} else if (strcmp(MLX5_ALLOW_DUPLICATE_PATTERN, key) == 0) {
config->allow_duplicate_pattern = !!tmp;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+ } else if (strcmp(MLX5_MPRQ_TSTORE_MEMCPY, key) == 0) {
+ config->mprq_tstore_memcpy = tmp;
+#endif
} else {
DRV_LOG(WARNING, "%s: unknown parameter", key);
rte_errno = EINVAL;
@@ -2051,6 +2060,9 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
MLX5_SYS_MEM_EN,
MLX5_DECAP_EN,
MLX5_ALLOW_DUPLICATE_PATTERN,
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+ MLX5_MPRQ_TSTORE_MEMCPY,
+#endif
NULL,
};
struct rte_kvargs *kvlist;
@@ -298,6 +298,9 @@ struct mlx5_dev_config {
int tx_skew; /* Tx scheduling skew between WQE and data on wire. */
struct mlx5_hca_attr hca_attr; /* HCA attributes. */
struct mlx5_lro_config lro; /* LRO configuration. */
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+ unsigned int mprq_tstore_memcpy:1;
+#endif
};
@@ -148,6 +148,9 @@ struct mlx5_rxq_data {
uint32_t rxseg_n; /* Number of split segment descriptions. */
struct mlx5_eth_rxseg rxseg[MLX5_MAX_RXQ_NSEG];
/* Buffer split segment descriptions - sizes, offsets, pools. */
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+ unsigned int mprq_tstore_memcpy:1;
+#endif
} __rte_cache_aligned;
enum mlx5_rxq_type {
@@ -422,6 +425,15 @@ mprq_buf_to_pkt(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, uint32_t len,
const uint32_t offset = strd_idx * strd_sz + strd_shift;
void *addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+ if (unlikely(!rxq->mprq_tstore_memcpy) &&
+ len <= rxq->mprq_max_memcpy_len) {
+ rte_prefetch1(addr);
+ if (len > RTE_CACHE_LINE_SIZE)
+ rte_prefetch2((void *)((uintptr_t)addr +
+ RTE_CACHE_LINE_SIZE));
+ }
+#endif
/*
* Memcpy packets to the target mbuf if:
* - The size of packet is smaller than mprq_max_memcpy_len.
@@ -433,8 +445,20 @@ mprq_buf_to_pkt(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, uint32_t len,
(hdrm_overlap > 0 && !rxq->strd_scatter_en)) {
if (likely(len <=
(uint32_t)(pkt->buf_len - RTE_PKTMBUF_HEADROOM))) {
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+ uintptr_t data_addr;
+
+ data_addr = (uintptr_t)rte_pktmbuf_mtod(pkt, void *);
+ if (!((data_addr | (uintptr_t)addr) & ALIGNMENT_MASK) &&
+ rxq->mprq_tstore_memcpy)
+ rte_memcpy_aligned_tstore16((void *)data_addr,
+ addr, len);
+ else
+ rte_memcpy((void *)data_addr, addr, len);
+#else
rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
addr, len);
+#endif
DATA_LEN(pkt) = len;
} else if (rxq->strd_scatter_en) {
struct rte_mbuf *prev = pkt;
@@ -1449,6 +1449,10 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
tmpl->socket = socket;
if (dev->data->dev_conf.intr_conf.rxq)
tmpl->irq = 1;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+ tmpl->rxq.mprq_tstore_memcpy = config->mprq_tstore_memcpy;
+#endif
+
/*
* This Rx queue can be configured as a Multi-Packet RQ if all of the
* following conditions are met:
@@ -38,6 +38,8 @@ option('max_lcores', type: 'integer', value: 128, description:
'maximum number of cores/threads supported by EAL')
option('max_numa_nodes', type: 'integer', value: 32, description:
'maximum number of NUMA nodes supported by EAL')
+option('mlx5_ntload_tstore', type: 'boolean', value: false, description:
+ 'to enable optimized MPRQ in RX datapath')
option('platform', type: 'string', value: 'native', description:
'Platform to build, either "native", "generic" or a SoC. Please refer to the Linux build guide for more information.')
option('enable_trace_fp', type: 'boolean', value: false, description: