cryptodev: add a simple mapping cache to speed up ops pool create

Message ID 20240119164122.11829-1-andrew.boyer@amd.com (mailing list archive)
State Superseded, archived
Delegated to: akhil goyal
Headers
Series cryptodev: add a simple mapping cache to speed up ops pool create |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/loongarch-compilation success Compilation OK
ci/loongarch-unit-testing success Unit Testing PASS
ci/Intel-compilation success Compilation OK
ci/github-robot: build success github build: passed
ci/intel-Testing success Testing PASS
ci/intel-Functional success Functional PASS
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-broadcom-Functional success Functional Testing PASS
ci/iol-abi-testing success Testing PASS
ci/iol-compile-amd64-testing success Testing PASS
ci/iol-unit-amd64-testing success Testing PASS
ci/iol-unit-arm64-testing fail Testing issues
ci/iol-compile-arm64-testing success Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-sample-apps-testing success Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-intel-Performance success Performance Testing PASS

Commit Message

Boyer, Andrew Jan. 19, 2024, 4:41 p.m. UTC
  Cache the most recent VA -> PA mapping found so that we can skip
most of the system calls. With 4K pages this reduces pool create
time by about 90%.

Signed-off-by: Andrew Boyer <andrew.boyer@amd.com>
---
 lib/cryptodev/rte_crypto.h    |  5 +++++
 lib/cryptodev/rte_cryptodev.c | 23 ++++++++++++++++++++++-
 2 files changed, 27 insertions(+), 1 deletion(-)
  

Comments

Akhil Goyal Feb. 6, 2024, 2:24 p.m. UTC | #1
> Cache the most recent VA -> PA mapping found so that we can skip
> most of the system calls. With 4K pages this reduces pool create
> time by about 90%.
> 
> Signed-off-by: Andrew Boyer <andrew.boyer@amd.com>

I believe there should be a generic solution for this in mempool
 if it is not there already.
Here, you are adding cache in mempool priv
which does not seem to be a correct place.
This optimization would be needed across all types of mempools.
Adding more people for comments.


> ---
>  lib/cryptodev/rte_crypto.h    |  5 +++++
>  lib/cryptodev/rte_cryptodev.c | 23 ++++++++++++++++++++++-
>  2 files changed, 27 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/cryptodev/rte_crypto.h b/lib/cryptodev/rte_crypto.h
> index dbc2700da5..ee6aa1e40e 100644
> --- a/lib/cryptodev/rte_crypto.h
> +++ b/lib/cryptodev/rte_crypto.h
> @@ -220,6 +220,11 @@ struct rte_crypto_op_pool_private {
>  	/**< Crypto op pool type operation. */
>  	uint16_t priv_size;
>  	/**< Size of private area in each crypto operation. */
> +
> +	unsigned long vp_cache;
> +	/* Virtual page address of previous op. */
> +	rte_iova_t iovp_cache;
> +	/* I/O virtual page address of previous op. */
>  };
> 
> 
> diff --git a/lib/cryptodev/rte_cryptodev.c b/lib/cryptodev/rte_cryptodev.c
> index b233c0ecd7..d596f85a57 100644
> --- a/lib/cryptodev/rte_cryptodev.c
> +++ b/lib/cryptodev/rte_cryptodev.c
> @@ -10,6 +10,7 @@
>  #include <errno.h>
>  #include <stdint.h>
>  #include <inttypes.h>
> +#include <unistd.h>
> 
>  #include <rte_log.h>
>  #include <rte_debug.h>
> @@ -2568,12 +2569,32 @@ rte_crypto_op_init(struct rte_mempool *mempool,
>  {
>  	struct rte_crypto_op *op = _op_data;
>  	enum rte_crypto_op_type type = *(enum rte_crypto_op_type
> *)opaque_arg;
> +	struct rte_crypto_op_pool_private *priv;
> +	unsigned long virt_addr = (unsigned long)(uintptr_t)_op_data;
> +#ifdef RTE_EXEC_ENV_WINDOWS
> +	unsigned long page_mask = 4095;
> +#else
> +	unsigned long page_mask = sysconf(_SC_PAGESIZE) - 1;
> +#endif
> +	unsigned long virt_page = virt_addr & ~page_mask;
> 
>  	memset(_op_data, 0, mempool->elt_size);
> 
>  	__rte_crypto_op_reset(op, type);
> 
> -	op->phys_addr = rte_mem_virt2iova(_op_data);
> +	priv = (struct rte_crypto_op_pool_private *)
> +		rte_mempool_get_priv(mempool);
> +
> +	if (virt_page == priv->vp_cache) {
> +		op->phys_addr = priv->iovp_cache + (virt_addr & page_mask);
> +	} else {
> +		op->phys_addr = rte_mem_virt2iova(_op_data);
> +
> +		/* Update cached values */
> +		priv->vp_cache = virt_page;
> +		priv->iovp_cache = op->phys_addr & ~page_mask;
> +	}
> +
>  	op->mempool = mempool;
>  }
> 
> --
> 2.17.1
  
Morten Brørup Feb. 7, 2024, 2:24 a.m. UTC | #2
> From: Akhil Goyal [mailto:gakhil@marvell.com]
> Sent: Tuesday, 6 February 2024 15.25
> 
> > Cache the most recent VA -> PA mapping found so that we can skip
> > most of the system calls. With 4K pages this reduces pool create
> > time by about 90%.
> >
> > Signed-off-by: Andrew Boyer <andrew.boyer@amd.com>
> 
> I believe there should be a generic solution for this in mempool
>  if it is not there already.
> Here, you are adding cache in mempool priv
> which does not seem to be a correct place.
> This optimization would be needed across all types of mempools.
> Adding more people for comments.
> 
> 
> > ---
> >  lib/cryptodev/rte_crypto.h    |  5 +++++
> >  lib/cryptodev/rte_cryptodev.c | 23 ++++++++++++++++++++++-
> >  2 files changed, 27 insertions(+), 1 deletion(-)
> >
> > diff --git a/lib/cryptodev/rte_crypto.h b/lib/cryptodev/rte_crypto.h
> > index dbc2700da5..ee6aa1e40e 100644
> > --- a/lib/cryptodev/rte_crypto.h
> > +++ b/lib/cryptodev/rte_crypto.h
> > @@ -220,6 +220,11 @@ struct rte_crypto_op_pool_private {
> >  	/**< Crypto op pool type operation. */
> >  	uint16_t priv_size;
> >  	/**< Size of private area in each crypto operation. */
> > +
> > +	unsigned long vp_cache;
> > +	/* Virtual page address of previous op. */
> > +	rte_iova_t iovp_cache;
> > +	/* I/O virtual page address of previous op. */
> >  };
> >
> >
> > diff --git a/lib/cryptodev/rte_cryptodev.c
> b/lib/cryptodev/rte_cryptodev.c
> > index b233c0ecd7..d596f85a57 100644
> > --- a/lib/cryptodev/rte_cryptodev.c
> > +++ b/lib/cryptodev/rte_cryptodev.c
> > @@ -10,6 +10,7 @@
> >  #include <errno.h>
> >  #include <stdint.h>
> >  #include <inttypes.h>
> > +#include <unistd.h>
> >
> >  #include <rte_log.h>
> >  #include <rte_debug.h>
> > @@ -2568,12 +2569,32 @@ rte_crypto_op_init(struct rte_mempool
> *mempool,
> >  {
> >  	struct rte_crypto_op *op = _op_data;
> >  	enum rte_crypto_op_type type = *(enum rte_crypto_op_type
> > *)opaque_arg;
> > +	struct rte_crypto_op_pool_private *priv;
> > +	unsigned long virt_addr = (unsigned long)(uintptr_t)_op_data;
> > +#ifdef RTE_EXEC_ENV_WINDOWS
> > +	unsigned long page_mask = 4095;
> > +#else
> > +	unsigned long page_mask = sysconf(_SC_PAGESIZE) - 1;
> > +#endif
> > +	unsigned long virt_page = virt_addr & ~page_mask;
> >
> >  	memset(_op_data, 0, mempool->elt_size);
> >
> >  	__rte_crypto_op_reset(op, type);
> >
> > -	op->phys_addr = rte_mem_virt2iova(_op_data);

This optimization is for rte_mem_virt2iova(_op_data) being slow.

If I'm not mistaken, _op_data is an object in a mempool, where the mempool object headers have already been initialized.

In this case, it could simply be optimized as this:
-	op->phys_addr = rte_mem_virt2iova(_op_data);
+	op->phys_addr = rte_mempool_virt2iova(_op_data);

Now going down a rat hole...

If the above is true, I wonder if struct rte_crypto_op is only instantiated as objects in mempools... If it is, the op->mempool and op->phys_addr fields are fundamentally redundant, and can be retrieved from the mempool object header instead:
op->phys_addr === rte_mempool_virt2iova(op)
op->mempool === rte_mempool_from_obj(op)

Having these shadow variables in struct rte_crypto_op may provide performance benefits when RTE_MEMPOOL_F_NO_CACHE_ALIGN is not set on the mempool, the mempool object header is in the preceding cache line of the mempool object (containing the struct rte_crypto_op op).

A better solution than these shadow variables would be to introduce another mempool flag to cache align the mempool object header, but let the mempool object itself directly follow the mempool object header, so the mempool object header and the mempool object itself (if small enough) reside in the same cache line. This would also use less memory.

> > +	priv = (struct rte_crypto_op_pool_private *)
> > +		rte_mempool_get_priv(mempool);
> > +
> > +	if (virt_page == priv->vp_cache) {
> > +		op->phys_addr = priv->iovp_cache + (virt_addr & page_mask);
> > +	} else {
> > +		op->phys_addr = rte_mem_virt2iova(_op_data);
> > +
> > +		/* Update cached values */
> > +		priv->vp_cache = virt_page;
> > +		priv->iovp_cache = op->phys_addr & ~page_mask;
> > +	}
> > +
> >  	op->mempool = mempool;
> >  }
> >
> > --
> > 2.17.1
  
Boyer, Andrew Feb. 7, 2024, 3:02 a.m. UTC | #3
On Feb 6, 2024, at 9:24 PM, Morten Brørup <mb@smartsharesystems.com> wrote:

Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.


From: Akhil Goyal [mailto:gakhil@marvell.com]
Sent: Tuesday, 6 February 2024 15.25

Cache the most recent VA -> PA mapping found so that we can skip
most of the system calls. With 4K pages this reduces pool create
time by about 90%.

Signed-off-by: Andrew Boyer <andrew.boyer@amd.com>

I believe there should be a generic solution for this in mempool
if it is not there already.
Here, you are adding cache in mempool priv
which does not seem to be a correct place.
This optimization would be needed across all types of mempools.
Adding more people for comments.


---
lib/cryptodev/rte_crypto.h    |  5 +++++
lib/cryptodev/rte_cryptodev.c | 23 ++++++++++++++++++++++-
2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/lib/cryptodev/rte_crypto.h b/lib/cryptodev/rte_crypto.h
index dbc2700da5..ee6aa1e40e 100644
--- a/lib/cryptodev/rte_crypto.h
+++ b/lib/cryptodev/rte_crypto.h
@@ -220,6 +220,11 @@ struct rte_crypto_op_pool_private {
   /**< Crypto op pool type operation. */
   uint16_t priv_size;
   /**< Size of private area in each crypto operation. */
+
+   unsigned long vp_cache;
+   /* Virtual page address of previous op. */
+   rte_iova_t iovp_cache;
+   /* I/O virtual page address of previous op. */
};


diff --git a/lib/cryptodev/rte_cryptodev.c
b/lib/cryptodev/rte_cryptodev.c
index b233c0ecd7..d596f85a57 100644
--- a/lib/cryptodev/rte_cryptodev.c
+++ b/lib/cryptodev/rte_cryptodev.c
@@ -10,6 +10,7 @@
#include <errno.h>
#include <stdint.h>
#include <inttypes.h>
+#include <unistd.h>

#include <rte_log.h>
#include <rte_debug.h>
@@ -2568,12 +2569,32 @@ rte_crypto_op_init(struct rte_mempool
*mempool,
{
   struct rte_crypto_op *op = _op_data;
   enum rte_crypto_op_type type = *(enum rte_crypto_op_type
*)opaque_arg;
+   struct rte_crypto_op_pool_private *priv;
+   unsigned long virt_addr = (unsigned long)(uintptr_t)_op_data;
+#ifdef RTE_EXEC_ENV_WINDOWS
+   unsigned long page_mask = 4095;
+#else
+   unsigned long page_mask = sysconf(_SC_PAGESIZE) - 1;
+#endif
+   unsigned long virt_page = virt_addr & ~page_mask;

   memset(_op_data, 0, mempool->elt_size);

   __rte_crypto_op_reset(op, type);

-   op->phys_addr = rte_mem_virt2iova(_op_data);

This optimization is for rte_mem_virt2iova(_op_data) being slow.

If I'm not mistaken, _op_data is an object in a mempool, where the mempool object headers have already been initialized.

In this case, it could simply be optimized as this:
-       op->phys_addr = rte_mem_virt2iova(_op_data);
+       op->phys_addr = rte_mempool_virt2iova(_op_data);


That certainly is shorter! Thanks, I was not aware of this function.

-Andrew
  

Patch

diff --git a/lib/cryptodev/rte_crypto.h b/lib/cryptodev/rte_crypto.h
index dbc2700da5..ee6aa1e40e 100644
--- a/lib/cryptodev/rte_crypto.h
+++ b/lib/cryptodev/rte_crypto.h
@@ -220,6 +220,11 @@  struct rte_crypto_op_pool_private {
 	/**< Crypto op pool type operation. */
 	uint16_t priv_size;
 	/**< Size of private area in each crypto operation. */
+
+	unsigned long vp_cache;
+	/* Virtual page address of previous op. */
+	rte_iova_t iovp_cache;
+	/* I/O virtual page address of previous op. */
 };
 
 
diff --git a/lib/cryptodev/rte_cryptodev.c b/lib/cryptodev/rte_cryptodev.c
index b233c0ecd7..d596f85a57 100644
--- a/lib/cryptodev/rte_cryptodev.c
+++ b/lib/cryptodev/rte_cryptodev.c
@@ -10,6 +10,7 @@ 
 #include <errno.h>
 #include <stdint.h>
 #include <inttypes.h>
+#include <unistd.h>
 
 #include <rte_log.h>
 #include <rte_debug.h>
@@ -2568,12 +2569,32 @@  rte_crypto_op_init(struct rte_mempool *mempool,
 {
 	struct rte_crypto_op *op = _op_data;
 	enum rte_crypto_op_type type = *(enum rte_crypto_op_type *)opaque_arg;
+	struct rte_crypto_op_pool_private *priv;
+	unsigned long virt_addr = (unsigned long)(uintptr_t)_op_data;
+#ifdef RTE_EXEC_ENV_WINDOWS
+	unsigned long page_mask = 4095;
+#else
+	unsigned long page_mask = sysconf(_SC_PAGESIZE) - 1;
+#endif
+	unsigned long virt_page = virt_addr & ~page_mask;
 
 	memset(_op_data, 0, mempool->elt_size);
 
 	__rte_crypto_op_reset(op, type);
 
-	op->phys_addr = rte_mem_virt2iova(_op_data);
+	priv = (struct rte_crypto_op_pool_private *)
+		rte_mempool_get_priv(mempool);
+
+	if (virt_page == priv->vp_cache) {
+		op->phys_addr = priv->iovp_cache + (virt_addr & page_mask);
+	} else {
+		op->phys_addr = rte_mem_virt2iova(_op_data);
+
+		/* Update cached values */
+		priv->vp_cache = virt_page;
+		priv->iovp_cache = op->phys_addr & ~page_mask;
+	}
+
 	op->mempool = mempool;
 }