[v1,3/3] gpu/cuda: mem alloc aligned memory

Message ID 20220104014721.1799-4-eagostini@nvidia.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series GPU memory aligned

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/iol-broadcom-Functional success Functional Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/github-robot: build success github build: passed
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-aarch64-unit-testing success Testing PASS
ci/iol-x86_64-unit-testing success Testing PASS
ci/iol-aarch64-compile-testing success Testing PASS
ci/iol-x86_64-compile-testing success Testing PASS
ci/iol-abi-testing success Testing PASS
ci/Intel-compilation success Compilation OK
ci/intel-Testing success Testing PASS

Commit Message

Elena Agostini Jan. 4, 2022, 1:47 a.m. UTC
  From: Elena Agostini <eagostini@nvidia.com>

Implement aligned GPU memory allocation in GPU CUDA driver.

Signed-off-by: Elena Agostini <eagostini@nvidia.com>
---
 drivers/gpu/cuda/cuda.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)
  

Comments

Stephen Hemminger Jan. 3, 2022, 6:05 p.m. UTC | #1
On Tue, 4 Jan 2022 01:47:21 +0000
<eagostini@nvidia.com> wrote:

>  static int
> -cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
> +cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr, unsigned int align)
>  {
>  	CUresult res;
>  	const char *err_string;
> @@ -610,8 +612,10 @@ cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
>  
>  	/* Allocate memory */
>  	mem_alloc_list_tail->size = size;
> -	res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_d),
> -			mem_alloc_list_tail->size);
> +	mem_alloc_list_tail->size_orig = size + align;
> +
> +	res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_orig_d),
> +			mem_alloc_list_tail->size_orig);
>  	if (res != 0) {
>  		pfn_cuGetErrorString(res, &(err_string));
>  		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
> @@ -620,6 +624,13 @@ cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
>  		return -rte_errno;
>  	}
>  
> +
> +	/* Align memory address */
> +	mem_alloc_list_tail->ptr_d = mem_alloc_list_tail->ptr_orig_d;
> +	if (align && ((uintptr_t)mem_alloc_list_tail->ptr_d) % align)
> +		mem_alloc_list_tail->ptr_d += (align -
> +				(((uintptr_t)mem_alloc_list_tail->ptr_d) % align));


Posix memalign takes size_t for both size and alignment.

Better to put the input parameters first, and then the resulting output parameter last
for consistency; follows the Rusty API design manifesto.

Alignment only makes sense if power of two. The code should check that and optimize
for that.
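
For reference, a minimal standalone sketch of such a power-of-two check (plain C; the helper name is illustrative and not part of the patch or of gpudev):

#include <stdbool.h>
#include <stddef.h>

/*
 * True when align is zero (no alignment requested) or a power of two;
 * any other value cannot be honoured with a simple mask.
 */
static bool
cuda_align_is_valid(size_t align)
{
	return (align & (align - 1)) == 0;
}

With a power-of-two alignment, the round-up itself can be done with a mask instead of a modulo, as sketched after comment #3 below.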
  
Elena Agostini Jan. 3, 2022, 6:15 p.m. UTC | #2
> From: Stephen Hemminger <stephen@networkplumber.org>
> Date: Monday, 3 January 2022 at 19:05
> To: Elena Agostini <eagostini@nvidia.com>
> Cc: dev@dpdk.org <dev@dpdk.org>
> Subject: Re: [PATCH v1 3/3] gpu/cuda: mem alloc aligned memory
>

> On Tue, 4 Jan 2022 01:47:21 +0000
> <eagostini@nvidia.com> wrote:

> >  static int
> > -cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
> > +cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr, unsigned int align)
> >  {
> >       CUresult res;
> >       const char *err_string;
> > @@ -610,8 +612,10 @@ cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
> >
> >       /* Allocate memory */
> >       mem_alloc_list_tail->size = size;
> > -     res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_d),
> > -                     mem_alloc_list_tail->size);
> > +     mem_alloc_list_tail->size_orig = size + align;
> > +
> > +     res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_orig_d),
> > +                     mem_alloc_list_tail->size_orig);
> >       if (res != 0) {
> >               pfn_cuGetErrorString(res, &(err_string));
> >               rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
> > @@ -620,6 +624,13 @@ cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
> >               return -rte_errno;
> >       }
> >
> > +
> > +     /* Align memory address */
> > +     mem_alloc_list_tail->ptr_d = mem_alloc_list_tail->ptr_orig_d;
> > +     if (align && ((uintptr_t)mem_alloc_list_tail->ptr_d) % align)
> > +             mem_alloc_list_tail->ptr_d += (align -
> > +                             (((uintptr_t)mem_alloc_list_tail->ptr_d) % align));
>

> Posix memalign takes size_t for both size and alignment.

I've created this patch based on the rte_malloc function definition for consistency.

void * rte_malloc(const char *type, size_t size, unsigned align)


> Better to put the input parameters first, and then the resulting output parameter last
> for consistency; follows the Rusty API design manifesto.

Got it, will do.

> Alignment only makes sense if power of two. The code should check that and optimize
> for that.
>

The alignment value is checked in the gpudev library before
passing it to the driver.

Adding this kind of check in the driver has been rejected in the past because it was
considered dead code (the library was already checking input parameters).

Let me know what the preferred options are.
  
Stephen Hemminger Jan. 3, 2022, 6:17 p.m. UTC | #3
On Mon, 3 Jan 2022 18:15:11 +0000
Elena Agostini <eagostini@nvidia.com> wrote:

> > Alignment only makes sense if power of two. The code should check that and optimize
> > for that.
> >  
> 
> The alignment value is checked in the gpudev library before
> passing it to the driver.
> 
> Adding this kind of check in the driver has been rejected in the past because it was
> considered dead code (the library was already checking input parameters).
> 
> Let me know what the preferred options are.

Driver could use the mask instead of slow divide operation.
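
A minimal sketch of that mask-based round-up (plain C, assuming align is a non-zero power of two; the helper name is illustrative, not from the patch):

#include <stdint.h>

/* Round addr up to the next multiple of align (align must be a non-zero power of two). */
static uintptr_t
align_up(uintptr_t addr, uintptr_t align)
{
	return (addr + align - 1) & ~(align - 1);
}

This avoids the modulo operations in the posted hunk, though as noted in the reply below the cost is negligible on an allocation path.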
  
Elena Agostini Jan. 3, 2022, 6:22 p.m. UTC | #4
> On Mon, 3 Jan 2022 18:15:11 +0000
> Elena Agostini <eagostini@nvidia.com> wrote:
>
> > > Alignment only makes sense if power of two. The code should check that and optimize
> > > for that.
> > >
> >
> > The alignment value is checked in the gpudev library before
> > passing it to the driver.
> >
> > Adding this kind of check in the driver has been rejected in the past because it was
> > considered dead code (the library was already checking input parameters).
> >
> > Let me know what the preferred options are.
>
> Driver could use the mask instead of slow divide operation.

I wouldn't be concerned about performance here.
Memory allocation is expensive; typically you want to do it
at initialization time.

What do you suggest for my other comments?
  

Patch

diff --git a/drivers/gpu/cuda/cuda.c b/drivers/gpu/cuda/cuda.c
index 882df08e56..4ad3f5fc90 100644
--- a/drivers/gpu/cuda/cuda.c
+++ b/drivers/gpu/cuda/cuda.c
@@ -139,8 +139,10 @@  typedef uintptr_t cuda_ptr_key;
 /* Single entry of the memory list */
 struct mem_entry {
 	CUdeviceptr ptr_d;
+	CUdeviceptr ptr_orig_d;
 	void *ptr_h;
 	size_t size;
+	size_t size_orig;
 	struct rte_gpu *dev;
 	CUcontext ctx;
 	cuda_ptr_key pkey;
@@ -569,7 +571,7 @@  cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info)
  */
 
 static int
-cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
+cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr, unsigned int align)
 {
 	CUresult res;
 	const char *err_string;
@@ -610,8 +612,10 @@  cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
 
 	/* Allocate memory */
 	mem_alloc_list_tail->size = size;
-	res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_d),
-			mem_alloc_list_tail->size);
+	mem_alloc_list_tail->size_orig = size + align;
+
+	res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_orig_d),
+			mem_alloc_list_tail->size_orig);
 	if (res != 0) {
 		pfn_cuGetErrorString(res, &(err_string));
 		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
@@ -620,6 +624,13 @@  cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
 		return -rte_errno;
 	}
 
+
+	/* Align memory address */
+	mem_alloc_list_tail->ptr_d = mem_alloc_list_tail->ptr_orig_d;
+	if (align && ((uintptr_t)mem_alloc_list_tail->ptr_d) % align)
+		mem_alloc_list_tail->ptr_d += (align -
+				(((uintptr_t)mem_alloc_list_tail->ptr_d) % align));
+
 	/* GPUDirect RDMA attribute required */
 	res = pfn_cuPointerSetAttribute(&flag,
 			CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
@@ -634,7 +645,6 @@  cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
 
 	mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_d);
 	mem_alloc_list_tail->ptr_h = NULL;
-	mem_alloc_list_tail->size = size;
 	mem_alloc_list_tail->dev = dev;
 	mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 	mem_alloc_list_tail->mtype = GPU_MEM;
@@ -761,6 +771,7 @@  cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr)
 	mem_alloc_list_tail->dev = dev;
 	mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 	mem_alloc_list_tail->mtype = CPU_REGISTERED;
+	mem_alloc_list_tail->ptr_orig_d = mem_alloc_list_tail->ptr_d;
 
 	/* Restore original ctx as current ctx */
 	res = pfn_cuCtxSetCurrent(current_ctx);
@@ -796,7 +807,7 @@  cuda_mem_free(struct rte_gpu *dev, void *ptr)
 	}
 
 	if (mem_item->mtype == GPU_MEM) {
-		res = pfn_cuMemFree(mem_item->ptr_d);
+		res = pfn_cuMemFree(mem_item->ptr_orig_d);
 		if (res != 0) {
 			pfn_cuGetErrorString(res, &(err_string));
 			rte_cuda_log(ERR, "cuMemFree current failed with %s",