[v1,3/3] gpu/cuda: mem alloc aligned memory
Checks
Commit Message
From: Elena Agostini <eagostini@nvidia.com>
Implement aligned GPU memory allocation in GPU CUDA driver.
Signed-off-by: Elena Agostini <eagostini@nvidia.com>
---
drivers/gpu/cuda/cuda.c | 21 ++++++++++++++++-----
1 file changed, 16 insertions(+), 5 deletions(-)
Comments
On Tue, 4 Jan 2022 01:47:21 +0000
<eagostini@nvidia.com> wrote:
> static int
> -cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
> +cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr, unsigned int align)
> {
> CUresult res;
> const char *err_string;
> @@ -610,8 +612,10 @@ cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
>
> /* Allocate memory */
> mem_alloc_list_tail->size = size;
> - res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_d),
> - mem_alloc_list_tail->size);
> + mem_alloc_list_tail->size_orig = size + align;
> +
> + res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_orig_d),
> + mem_alloc_list_tail->size_orig);
> if (res != 0) {
> pfn_cuGetErrorString(res, &(err_string));
> rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
> @@ -620,6 +624,13 @@ cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
> return -rte_errno;
> }
>
> +
> + /* Align memory address */
> + mem_alloc_list_tail->ptr_d = mem_alloc_list_tail->ptr_orig_d;
> + if (align && ((uintptr_t)mem_alloc_list_tail->ptr_d) % align)
> + mem_alloc_list_tail->ptr_d += (align -
> + (((uintptr_t)mem_alloc_list_tail->ptr_d) % align));
POSIX posix_memalign() takes size_t for both the size and the alignment.
Better to put the input parameters first, and then the resulting output parameter last
for consistency; follows the Rusty API design manifesto.
Alignment only makes sense if it is a power of two. The code should check for that and optimize
for that case.
> From: Stephen Hemminger <stephen@networkplumber.org>
> Date: Monday, 3 January 2022 at 19:05
> To: Elena Agostini <eagostini@nvidia.com>
> Cc: dev@dpdk.org <dev@dpdk.org>
> Subject: Re: [PATCH v1 3/3] gpu/cuda: mem alloc aligned memory
> External email: Use caution opening links or attachments>
>
> On Tue, 4 Jan 2022 01:47:21 +0000
> <eagostini@nvidia.com> wrote:>
> > static int
> > -cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
> > +cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr, unsigned int align)
> > {
> > CUresult res;
> > const char *err_string;
> > @@ -610,8 +612,10 @@ cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
> >
> > /* Allocate memory */
> > mem_alloc_list_tail->size = size;
> > - res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_d),
> > - mem_alloc_list_tail->size);
> > + mem_alloc_list_tail->size_orig = size + align;
> > +
> > + res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_orig_d),
> > + mem_alloc_list_tail->size_orig);
> > if (res != 0) {
> > pfn_cuGetErrorString(res, &(err_string));
> > rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
> > @@ -620,6 +624,13 @@ cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
> > return -rte_errno;
> > }
> >
> > +
> > + /* Align memory address */
> > + mem_alloc_list_tail->ptr_d = mem_alloc_list_tail->ptr_orig_d;
> > + if (align && ((uintptr_t)mem_alloc_list_tail->ptr_d) % align)
> > + mem_alloc_list_tail->ptr_d += (align -
> > + (((uintptr_t)mem_alloc_list_tail->ptr_d) % align));>
>
> Posix memalign takes size_t for both size and alignment.
I've created this patch based on the rte_malloc function definition for consistency.
void * rte_malloc(const char *type, size_t size, unsigned align)
> Better to put the input parameters first, and then the resulting output parameter last
> for consistency; follows the Rusty API design manifesto.
Got it, will do.
> Alignment only makes sense if power of two. The code should check that and optimize
> for that.
>
The alignment value is checked in the gpudev library before
it is passed to the driver.
Adding this kind of check in the driver has been rejected in the past because it was
considered dead code (the library already checks the input parameters).
Let me know what are the preferred options.
On Mon, 3 Jan 2022 18:15:11 +0000
Elena Agostini <eagostini@nvidia.com> wrote:
> > Alignment only makes sense if power of two. The code should check that and optimize
> > for that.
> >
>
> The alignment value is checked in the gpudev library before
> passing it to the driver.
>
> Adding this kind of checks in the driver has been rejected in the past because it was
> considered dead code (the library was already checking input parameters).
>
> Let me know what are the preferred options.
Driver could use the mask instead of slow divide operation.
> On Mon, 3 Jan 2022 18:15:11 +0000
> Elena Agostini <eagostini@nvidia.com> wrote:
>
> > > Alignment only makes sense if power of two. The code should check that and optimize
> > > for that.
> > >
> >
> > The alignment value is checked in the gpudev library before
> > passing it to the driver.
> >
> > Adding this kind of checks in the driver has been rejected in the past because it was
> > considered dead code (the library was already checking input parameters).
> >
> > Let me know what are the preferred options.
>
> Driver could use the mask instead of slow divide operation.
I'd not be concerned about performance here.
Memory allocation is expensive, typically you want to do it
at initialization time.
What do you suggest for my other comments?
@@ -139,8 +139,10 @@ typedef uintptr_t cuda_ptr_key;
/* Single entry of the memory list */
struct mem_entry {
CUdeviceptr ptr_d;
+ CUdeviceptr ptr_orig_d;
void *ptr_h;
size_t size;
+ size_t size_orig;
struct rte_gpu *dev;
CUcontext ctx;
cuda_ptr_key pkey;
@@ -569,7 +571,7 @@ cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info)
*/
static int
-cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
+cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr, unsigned int align)
{
CUresult res;
const char *err_string;
@@ -610,8 +612,10 @@ cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
/* Allocate memory */
mem_alloc_list_tail->size = size;
- res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_d),
- mem_alloc_list_tail->size);
+ mem_alloc_list_tail->size_orig = size + align;
+
+ res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_orig_d),
+ mem_alloc_list_tail->size_orig);
if (res != 0) {
pfn_cuGetErrorString(res, &(err_string));
rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
@@ -620,6 +624,13 @@ cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
return -rte_errno;
}
+
+ /* Align memory address */
+ mem_alloc_list_tail->ptr_d = mem_alloc_list_tail->ptr_orig_d;
+ if (align && ((uintptr_t)mem_alloc_list_tail->ptr_d) % align)
+ mem_alloc_list_tail->ptr_d += (align -
+ (((uintptr_t)mem_alloc_list_tail->ptr_d) % align));
+
/* GPUDirect RDMA attribute required */
res = pfn_cuPointerSetAttribute(&flag,
CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
@@ -634,7 +645,6 @@ cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_d);
mem_alloc_list_tail->ptr_h = NULL;
- mem_alloc_list_tail->size = size;
mem_alloc_list_tail->dev = dev;
mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
mem_alloc_list_tail->mtype = GPU_MEM;
@@ -761,6 +771,7 @@ cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr)
mem_alloc_list_tail->dev = dev;
mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
mem_alloc_list_tail->mtype = CPU_REGISTERED;
+ mem_alloc_list_tail->ptr_orig_d = mem_alloc_list_tail->ptr_d;
/* Restore original ctx as current ctx */
res = pfn_cuCtxSetCurrent(current_ctx);
@@ -796,7 +807,7 @@ cuda_mem_free(struct rte_gpu *dev, void *ptr)
}
if (mem_item->mtype == GPU_MEM) {
- res = pfn_cuMemFree(mem_item->ptr_d);
+ res = pfn_cuMemFree(mem_item->ptr_orig_d);
if (res != 0) {
pfn_cuGetErrorString(res, &(err_string));
rte_cuda_log(ERR, "cuMemFree current failed with %s",