diff mbox series

[2/3] mem: fix ASan shadow for remapped memory segments

Message ID 20220415173127.3838-3-david.marchand@redhat.com (mailing list archive)
State Changes Requested
Delegated to: David Marchand
Headers show
Series Enable ASan in GHA | expand

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

David Marchand April 15, 2022, 5:31 p.m. UTC
When releasing some memory, the allocator can choose to return some
pages to the OS. At the same time, this memory was poisoned in the ASan
shadow. Doing the latter made it impossible to remap this same page
later.
On the other hand, without this poison, the OS would pagefault in any
case for this page.

Remove the poisoning for unmapped pages.

Bugzilla ID: 994
Fixes: 6cc51b1293ce ("mem: instrument allocator for ASan")
Cc: stable@dpdk.org

Signed-off-by: David Marchand <david.marchand@redhat.com>
---
 lib/eal/common/malloc_elem.h |  4 ++++
 lib/eal/common/malloc_heap.c | 12 +++++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

Comments

Burakov, Anatoly April 20, 2022, 2:47 p.m. UTC | #1
On 15-Apr-22 6:31 PM, David Marchand wrote:
> When releasing some memory, the allocator can choose to return some
> pages to the OS. At the same time, this memory was poisoned in ASAn
> shadow. Doing the latter made it impossible to remap this same page
> later.
> On the other hand, without this poison, the OS would pagefault in any
> case for this page.
> 
> Remove the poisoning for unmapped pages.
> 
> Bugzilla ID: 994
> Fixes: 6cc51b1293ce ("mem: instrument allocator for ASan")
> Cc: stable@dpdk.org
> 
> Signed-off-by: David Marchand <david.marchand@redhat.com>
> ---
>   lib/eal/common/malloc_elem.h |  4 ++++
>   lib/eal/common/malloc_heap.c | 12 +++++++++++-
>   2 files changed, 15 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/eal/common/malloc_elem.h b/lib/eal/common/malloc_elem.h
> index 228f178418..b859003722 100644
> --- a/lib/eal/common/malloc_elem.h
> +++ b/lib/eal/common/malloc_elem.h
> @@ -272,6 +272,10 @@ old_malloc_size(struct malloc_elem *elem)
>   
>   #else /* !RTE_MALLOC_ASAN */
>   
> +static inline void
> +asan_set_zone(void *ptr __rte_unused, size_t len __rte_unused,
> +	uint32_t val __rte_unused) { }
> +
>   static inline void
>   asan_set_freezone(void *ptr __rte_unused, size_t size __rte_unused) { }
>   
> diff --git a/lib/eal/common/malloc_heap.c b/lib/eal/common/malloc_heap.c
> index 6c572b6f2c..5913d9f862 100644
> --- a/lib/eal/common/malloc_heap.c
> +++ b/lib/eal/common/malloc_heap.c
> @@ -860,6 +860,7 @@ malloc_heap_free(struct malloc_elem *elem)
>   	size_t len, aligned_len, page_sz;
>   	struct rte_memseg_list *msl;
>   	unsigned int i, n_segs, before_space, after_space;
> +	bool unmapped_pages = false;
>   	int ret;
>   	const struct internal_config *internal_conf =
>   		eal_get_internal_configuration();
> @@ -999,6 +1000,13 @@ malloc_heap_free(struct malloc_elem *elem)
>   
>   		/* don't care if any of this fails */
>   		malloc_heap_free_pages(aligned_start, aligned_len);
> +		/*
> +		 * Clear any poisoning in ASan for the associated pages so that
> +		 * next time EAL maps those pages, the allocator can access
> +		 * them.
> +		 */
> +		asan_set_zone(aligned_start, aligned_len, 0x00);
> +		unmapped_pages = true;
>   
>   		request_sync();
>   	} else {
> @@ -1032,7 +1040,9 @@ malloc_heap_free(struct malloc_elem *elem)
>   
>   	rte_mcfg_mem_write_unlock();
>   free_unlock:
> -	asan_set_freezone(asan_ptr, asan_data_len);
> +	/* Poison memory range if belonging to some still mapped pages. */
> +	if (!unmapped_pages)
> +		asan_set_freezone(asan_ptr, asan_data_len);
>   
>   	rte_spinlock_unlock(&(heap->lock));
>   	return ret;

I suspect the patch should be a little more complicated than that. When 
we unmap pages, we don't necessarily unmap the entire malloc element, it 
could be that we have a freed allocation like so:

| malloc header | free space | unmapped space | free space | next malloc 
header |

So, i think the freezone should be set from asan_ptr till aligned_start, 
and then from (aligned_start + aligned_len) till (asan_ptr + 
asan_data_len). Does that make sense?
David Marchand April 21, 2022, 9:37 a.m. UTC | #2
On Wed, Apr 20, 2022 at 4:47 PM Burakov, Anatoly
<anatoly.burakov@intel.com> wrote:
>
> On 15-Apr-22 6:31 PM, David Marchand wrote:
> > When releasing some memory, the allocator can choose to return some
> > pages to the OS. At the same time, this memory was poisoned in ASAn
> > shadow. Doing the latter made it impossible to remap this same page
> > later.
> > On the other hand, without this poison, the OS would pagefault in any
> > case for this page.
> >
> > Remove the poisoning for unmapped pages.
> >
> > Bugzilla ID: 994
> > Fixes: 6cc51b1293ce ("mem: instrument allocator for ASan")
> > Cc: stable@dpdk.org
> >
> > Signed-off-by: David Marchand <david.marchand@redhat.com>
> > ---
> >   lib/eal/common/malloc_elem.h |  4 ++++
> >   lib/eal/common/malloc_heap.c | 12 +++++++++++-
> >   2 files changed, 15 insertions(+), 1 deletion(-)
> >
> > diff --git a/lib/eal/common/malloc_elem.h b/lib/eal/common/malloc_elem.h
> > index 228f178418..b859003722 100644
> > --- a/lib/eal/common/malloc_elem.h
> > +++ b/lib/eal/common/malloc_elem.h
> > @@ -272,6 +272,10 @@ old_malloc_size(struct malloc_elem *elem)
> >
> >   #else /* !RTE_MALLOC_ASAN */
> >
> > +static inline void
> > +asan_set_zone(void *ptr __rte_unused, size_t len __rte_unused,
> > +     uint32_t val __rte_unused) { }
> > +
> >   static inline void
> >   asan_set_freezone(void *ptr __rte_unused, size_t size __rte_unused) { }
> >
> > diff --git a/lib/eal/common/malloc_heap.c b/lib/eal/common/malloc_heap.c
> > index 6c572b6f2c..5913d9f862 100644
> > --- a/lib/eal/common/malloc_heap.c
> > +++ b/lib/eal/common/malloc_heap.c
> > @@ -860,6 +860,7 @@ malloc_heap_free(struct malloc_elem *elem)
> >       size_t len, aligned_len, page_sz;
> >       struct rte_memseg_list *msl;
> >       unsigned int i, n_segs, before_space, after_space;
> > +     bool unmapped_pages = false;
> >       int ret;
> >       const struct internal_config *internal_conf =
> >               eal_get_internal_configuration();
> > @@ -999,6 +1000,13 @@ malloc_heap_free(struct malloc_elem *elem)
> >
> >               /* don't care if any of this fails */
> >               malloc_heap_free_pages(aligned_start, aligned_len);
> > +             /*
> > +              * Clear any poisoning in ASan for the associated pages so that
> > +              * next time EAL maps those pages, the allocator can access
> > +              * them.
> > +              */
> > +             asan_set_zone(aligned_start, aligned_len, 0x00);
> > +             unmapped_pages = true;
> >
> >               request_sync();
> >       } else {
> > @@ -1032,7 +1040,9 @@ malloc_heap_free(struct malloc_elem *elem)
> >
> >       rte_mcfg_mem_write_unlock();
> >   free_unlock:
> > -     asan_set_freezone(asan_ptr, asan_data_len);
> > +     /* Poison memory range if belonging to some still mapped pages. */
> > +     if (!unmapped_pages)
> > +             asan_set_freezone(asan_ptr, asan_data_len);
> >
> >       rte_spinlock_unlock(&(heap->lock));
> >       return ret;
>
> I suspect the patch should be a little more complicated than that. When
> we unmap pages, we don't necessarily unmap the entire malloc element, it
> could be that we have a freed allocation like so:
>
> | malloc header | free space | unmapped space | free space | next malloc
> header |
>
> So, i think the freezone should be set from asan_ptr till aligned_start,
> and then from (aligned_start + aligned_len) till (asan_ptr +
> asan_data_len). Does that make sense?

(btw, I get a bounce for Zhihong mail address, is he not working at
Intel anymore?)

To be honest, I don't understand if we can get to this situation :-)
(especially the free space after the unmapped region).
But I guess you mean something like (on top of current patch):

@@ -1040,9 +1040,25 @@ malloc_heap_free(struct malloc_elem *elem)

        rte_mcfg_mem_write_unlock();
 free_unlock:
-       /* Poison memory range if belonging to some still mapped pages. */
-       if (!unmapped_pages)
+       if (!unmapped_pages) {
                asan_set_freezone(asan_ptr, asan_data_len);
+       } else {
+               /*
+                * We may be in a situation where we unmapped pages like this:
+                * malloc header | free space | unmapped space | free
space | malloc header
+                */
+               void *free1_start = asan_ptr;
+               void *free1_end = aligned_start;
+               void *free2_start = RTE_PTR_ADD(aligned_start, aligned_len);
+               void *free2_end = RTE_PTR_ADD(asan_ptr, asan_data_len);
+
+               if (free1_start < free1_end)
+                       asan_set_freezone(free1_start,
+                               RTE_PTR_DIFF(free1_end, free1_start));
+               if (free2_start < free2_end)
+                       asan_set_freezone(free2_start,
+                               RTE_PTR_DIFF(free2_end, free2_start));
+       }

        rte_spinlock_unlock(&(heap->lock));
        return ret;
David Marchand April 21, 2022, 9:50 a.m. UTC | #3
On Thu, Apr 21, 2022 at 11:37 AM David Marchand
<david.marchand@redhat.com> wrote:
>
> On Wed, Apr 20, 2022 at 4:47 PM Burakov, Anatoly
> <anatoly.burakov@intel.com> wrote:
> >
> > On 15-Apr-22 6:31 PM, David Marchand wrote:
> > > When releasing some memory, the allocator can choose to return some
> > > pages to the OS. At the same time, this memory was poisoned in ASAn
> > > shadow. Doing the latter made it impossible to remap this same page
> > > later.
> > > On the other hand, without this poison, the OS would pagefault in any
> > > case for this page.
> > >
> > > Remove the poisoning for unmapped pages.
> > >
> > > Bugzilla ID: 994
> > > Fixes: 6cc51b1293ce ("mem: instrument allocator for ASan")
> > > Cc: stable@dpdk.org
> > >
> > > Signed-off-by: David Marchand <david.marchand@redhat.com>
> > > ---
> > >   lib/eal/common/malloc_elem.h |  4 ++++
> > >   lib/eal/common/malloc_heap.c | 12 +++++++++++-
> > >   2 files changed, 15 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/lib/eal/common/malloc_elem.h b/lib/eal/common/malloc_elem.h
> > > index 228f178418..b859003722 100644
> > > --- a/lib/eal/common/malloc_elem.h
> > > +++ b/lib/eal/common/malloc_elem.h
> > > @@ -272,6 +272,10 @@ old_malloc_size(struct malloc_elem *elem)
> > >
> > >   #else /* !RTE_MALLOC_ASAN */
> > >
> > > +static inline void
> > > +asan_set_zone(void *ptr __rte_unused, size_t len __rte_unused,
> > > +     uint32_t val __rte_unused) { }
> > > +
> > >   static inline void
> > >   asan_set_freezone(void *ptr __rte_unused, size_t size __rte_unused) { }
> > >
> > > diff --git a/lib/eal/common/malloc_heap.c b/lib/eal/common/malloc_heap.c
> > > index 6c572b6f2c..5913d9f862 100644
> > > --- a/lib/eal/common/malloc_heap.c
> > > +++ b/lib/eal/common/malloc_heap.c
> > > @@ -860,6 +860,7 @@ malloc_heap_free(struct malloc_elem *elem)
> > >       size_t len, aligned_len, page_sz;
> > >       struct rte_memseg_list *msl;
> > >       unsigned int i, n_segs, before_space, after_space;
> > > +     bool unmapped_pages = false;
> > >       int ret;
> > >       const struct internal_config *internal_conf =
> > >               eal_get_internal_configuration();
> > > @@ -999,6 +1000,13 @@ malloc_heap_free(struct malloc_elem *elem)
> > >
> > >               /* don't care if any of this fails */
> > >               malloc_heap_free_pages(aligned_start, aligned_len);
> > > +             /*
> > > +              * Clear any poisoning in ASan for the associated pages so that
> > > +              * next time EAL maps those pages, the allocator can access
> > > +              * them.
> > > +              */
> > > +             asan_set_zone(aligned_start, aligned_len, 0x00);
> > > +             unmapped_pages = true;
> > >
> > >               request_sync();
> > >       } else {
> > > @@ -1032,7 +1040,9 @@ malloc_heap_free(struct malloc_elem *elem)
> > >
> > >       rte_mcfg_mem_write_unlock();
> > >   free_unlock:
> > > -     asan_set_freezone(asan_ptr, asan_data_len);
> > > +     /* Poison memory range if belonging to some still mapped pages. */
> > > +     if (!unmapped_pages)
> > > +             asan_set_freezone(asan_ptr, asan_data_len);
> > >
> > >       rte_spinlock_unlock(&(heap->lock));
> > >       return ret;
> >
> > I suspect the patch should be a little more complicated than that. When
> > we unmap pages, we don't necessarily unmap the entire malloc element, it
> > could be that we have a freed allocation like so:
> >
> > | malloc header | free space | unmapped space | free space | next malloc
> > header |
> >
> > So, i think the freezone should be set from asan_ptr till aligned_start,
> > and then from (aligned_start + aligned_len) till (asan_ptr +
> > asan_data_len). Does that make sense?
>
> (btw, I get a bounce for Zhihong mail address, is he not working at
> Intel anymore?)
>
> To be honest, I don't understand if we can get to this situation :-)
> (especially the free space after the unmapped region).
> But I guess you mean something like (on top of current patch):
>
> @@ -1040,9 +1040,25 @@ malloc_heap_free(struct malloc_elem *elem)
>
>         rte_mcfg_mem_write_unlock();
>  free_unlock:
> -       /* Poison memory range if belonging to some still mapped pages. */
> -       if (!unmapped_pages)
> +       if (!unmapped_pages) {
>                 asan_set_freezone(asan_ptr, asan_data_len);
> +       } else {
> +               /*
> +                * We may be in a situation where we unmapped pages like this:
> +                * malloc header | free space | unmapped space | free
> space | malloc header
> +                */
> +               void *free1_start = asan_ptr;
> +               void *free1_end = aligned_start;
> +               void *free2_start = RTE_PTR_ADD(aligned_start, aligned_len);
> +               void *free2_end = RTE_PTR_ADD(asan_ptr, asan_data_len);
> +
> +               if (free1_start < free1_end)
> +                       asan_set_freezone(free1_start,
> +                               RTE_PTR_DIFF(free1_end, free1_start));
> +               if (free2_start < free2_end)
> +                       asan_set_freezone(free2_start,
> +                               RTE_PTR_DIFF(free2_end, free2_start));
> +       }
>
>         rte_spinlock_unlock(&(heap->lock));
>         return ret;

But I get a splat for func_reentrancy_autotest:

=================================================================
==4098809==ERROR: AddressSanitizer: heap-use-after-free on address
0x7fa00fa00030 at pc 0x0000035779fe bp 0x7ffe01ed0cf0 sp
0x7ffe01ed0ce8
READ of size 1 at 0x7fa00fa00030 thread T0
    #0 0x35779fd in malloc_elem_join_adjacent_free
../lib/eal/common/malloc_elem.c:539
    #1 0x3577bc5 in malloc_elem_free ../lib/eal/common/malloc_elem.c:586
    #2 0x357bd25 in malloc_heap_free ../lib/eal/common/malloc_heap.c:886
    #3 0x357e032 in mem_free ../lib/eal/common/rte_malloc.c:37
    #4 0x357e032 in rte_free ../lib/eal/common/rte_malloc.c:44
    #5 0x336a547 in rte_lpm_free ../lib/lpm/rte_lpm.c:281
    #6 0x2cb3488 in lpm_create_free ../app/test/test_func_reentrancy.c:395
    #7 0x2cb47ed in launch_test ../app/test/test_func_reentrancy.c:455
    #8 0x2cb47ed in test_func_reentrancy ../app/test/test_func_reentrancy.c:502
    #9 0x2b0dd75 in cmd_autotest_parsed ../app/test/commands.c:68
    #10 0x34e5dce in cmdline_parse ../lib/cmdline/cmdline_parse.c:287
    #11 0x34e31af in cmdline_valid_buffer ../lib/cmdline/cmdline.c:24
    #12 0x34eba9f in rdline_char_in ../lib/cmdline/cmdline_rdline.c:444
    #13 0x34e329b in cmdline_in ../lib/cmdline/cmdline.c:146
    #14 0x34e329b in cmdline_in ../lib/cmdline/cmdline.c:135
    #15 0x40d8a5 in main ../app/test/test.c:217
    #16 0x7fa41009b55f in __libc_start_call_main
../sysdeps/nptl/libc_start_call_main.h:58
    #17 0x7fa41009b60b in __libc_start_main_impl ../csu/libc-start.c:409
    #18 0x2b0dc64 in _start
(/home/dmarchan/dpdk/build-gcc-asan/app/test/dpdk-test+0x2b0dc64)

Address 0x7fa00fa00030 is a wild pointer.
SUMMARY: AddressSanitizer: heap-use-after-free
../lib/eal/common/malloc_elem.c:539 in malloc_elem_join_adjacent_free
Shadow bytes around the buggy address:
  0x0ff481f37fb0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
  0x0ff481f37fc0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
  0x0ff481f37fd0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
  0x0ff481f37fe0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
  0x0ff481f37ff0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
=>0x0ff481f38000: fd fd fd fd fd fd[fd]fd fd fd fd fd fd fd fd fd
  0x0ff481f38010: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
  0x0ff481f38020: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
  0x0ff481f38030: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
  0x0ff481f38040: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
  0x0ff481f38050: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
Shadow byte legend (one shadow byte represents 8 application bytes):
Burakov, Anatoly April 21, 2022, 1:18 p.m. UTC | #4
On 21-Apr-22 10:37 AM, David Marchand wrote:
> On Wed, Apr 20, 2022 at 4:47 PM Burakov, Anatoly
> <anatoly.burakov@intel.com> wrote:
>>
>> On 15-Apr-22 6:31 PM, David Marchand wrote:
>>> When releasing some memory, the allocator can choose to return some
>>> pages to the OS. At the same time, this memory was poisoned in ASAn
>>> shadow. Doing the latter made it impossible to remap this same page
>>> later.
>>> On the other hand, without this poison, the OS would pagefault in any
>>> case for this page.
>>>
>>> Remove the poisoning for unmapped pages.
>>>
>>> Bugzilla ID: 994
>>> Fixes: 6cc51b1293ce ("mem: instrument allocator for ASan")
>>> Cc: stable@dpdk.org
>>>
>>> Signed-off-by: David Marchand <david.marchand@redhat.com>
>>> ---
>>>    lib/eal/common/malloc_elem.h |  4 ++++
>>>    lib/eal/common/malloc_heap.c | 12 +++++++++++-
>>>    2 files changed, 15 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/lib/eal/common/malloc_elem.h b/lib/eal/common/malloc_elem.h
>>> index 228f178418..b859003722 100644
>>> --- a/lib/eal/common/malloc_elem.h
>>> +++ b/lib/eal/common/malloc_elem.h
>>> @@ -272,6 +272,10 @@ old_malloc_size(struct malloc_elem *elem)
>>>
>>>    #else /* !RTE_MALLOC_ASAN */
>>>
>>> +static inline void
>>> +asan_set_zone(void *ptr __rte_unused, size_t len __rte_unused,
>>> +     uint32_t val __rte_unused) { }
>>> +
>>>    static inline void
>>>    asan_set_freezone(void *ptr __rte_unused, size_t size __rte_unused) { }
>>>
>>> diff --git a/lib/eal/common/malloc_heap.c b/lib/eal/common/malloc_heap.c
>>> index 6c572b6f2c..5913d9f862 100644
>>> --- a/lib/eal/common/malloc_heap.c
>>> +++ b/lib/eal/common/malloc_heap.c
>>> @@ -860,6 +860,7 @@ malloc_heap_free(struct malloc_elem *elem)
>>>        size_t len, aligned_len, page_sz;
>>>        struct rte_memseg_list *msl;
>>>        unsigned int i, n_segs, before_space, after_space;
>>> +     bool unmapped_pages = false;
>>>        int ret;
>>>        const struct internal_config *internal_conf =
>>>                eal_get_internal_configuration();
>>> @@ -999,6 +1000,13 @@ malloc_heap_free(struct malloc_elem *elem)
>>>
>>>                /* don't care if any of this fails */
>>>                malloc_heap_free_pages(aligned_start, aligned_len);
>>> +             /*
>>> +              * Clear any poisoning in ASan for the associated pages so that
>>> +              * next time EAL maps those pages, the allocator can access
>>> +              * them.
>>> +              */
>>> +             asan_set_zone(aligned_start, aligned_len, 0x00);
>>> +             unmapped_pages = true;
>>>
>>>                request_sync();
>>>        } else {
>>> @@ -1032,7 +1040,9 @@ malloc_heap_free(struct malloc_elem *elem)
>>>
>>>        rte_mcfg_mem_write_unlock();
>>>    free_unlock:
>>> -     asan_set_freezone(asan_ptr, asan_data_len);
>>> +     /* Poison memory range if belonging to some still mapped pages. */
>>> +     if (!unmapped_pages)
>>> +             asan_set_freezone(asan_ptr, asan_data_len);
>>>
>>>        rte_spinlock_unlock(&(heap->lock));
>>>        return ret;
>>
>> I suspect the patch should be a little more complicated than that. When
>> we unmap pages, we don't necessarily unmap the entire malloc element, it
>> could be that we have a freed allocation like so:
>>
>> | malloc header | free space | unmapped space | free space | next malloc
>> header |
>>
>> So, i think the freezone should be set from asan_ptr till aligned_start,
>> and then from (aligned_start + aligned_len) till (asan_ptr +
>> asan_data_len). Does that make sense?
> 
> (btw, I get a bounce for Zhihong mail address, is he not working at
> Intel anymore?)
> 
> To be honest, I don't understand if we can get to this situation :-)
> (especially the free space after the unmapped region).
> But I guess you mean something like (on top of current patch):
> 
> @@ -1040,9 +1040,25 @@ malloc_heap_free(struct malloc_elem *elem)
> 
>          rte_mcfg_mem_write_unlock();
>   free_unlock:
> -       /* Poison memory range if belonging to some still mapped pages. */
> -       if (!unmapped_pages)
> +       if (!unmapped_pages) {
>                  asan_set_freezone(asan_ptr, asan_data_len);
> +       } else {
> +               /*
> +                * We may be in a situation where we unmapped pages like this:
> +                * malloc header | free space | unmapped space | free
> space | malloc header
> +                */
> +               void *free1_start = asan_ptr;
> +               void *free1_end = aligned_start;
> +               void *free2_start = RTE_PTR_ADD(aligned_start, aligned_len);
> +               void *free2_end = RTE_PTR_ADD(asan_ptr, asan_data_len);
> +
> +               if (free1_start < free1_end)
> +                       asan_set_freezone(free1_start,
> +                               RTE_PTR_DIFF(free1_end, free1_start));
> +               if (free2_start < free2_end)
> +                       asan_set_freezone(free2_start,
> +                               RTE_PTR_DIFF(free2_end, free2_start));
> +       }
> 
>          rte_spinlock_unlock(&(heap->lock));
>          return ret;
> 

Something like that, yes. I will have to think through this a bit more, 
especially in light of your func_reentrancy splat :)
Burakov, Anatoly April 26, 2022, 12:54 p.m. UTC | #5
On 21-Apr-22 2:18 PM, Burakov, Anatoly wrote:
> On 21-Apr-22 10:37 AM, David Marchand wrote:
>> On Wed, Apr 20, 2022 at 4:47 PM Burakov, Anatoly
>> <anatoly.burakov@intel.com> wrote:
>>>
>>> On 15-Apr-22 6:31 PM, David Marchand wrote:
>>>> When releasing some memory, the allocator can choose to return some
>>>> pages to the OS. At the same time, this memory was poisoned in ASAn
>>>> shadow. Doing the latter made it impossible to remap this same page
>>>> later.
>>>> On the other hand, without this poison, the OS would pagefault in any
>>>> case for this page.
>>>>
>>>> Remove the poisoning for unmapped pages.
>>>>
>>>> Bugzilla ID: 994
>>>> Fixes: 6cc51b1293ce ("mem: instrument allocator for ASan")
>>>> Cc: stable@dpdk.org
>>>>
>>>> Signed-off-by: David Marchand <david.marchand@redhat.com>
>>>> ---
>>>>    lib/eal/common/malloc_elem.h |  4 ++++
>>>>    lib/eal/common/malloc_heap.c | 12 +++++++++++-
>>>>    2 files changed, 15 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/lib/eal/common/malloc_elem.h 
>>>> b/lib/eal/common/malloc_elem.h
>>>> index 228f178418..b859003722 100644
>>>> --- a/lib/eal/common/malloc_elem.h
>>>> +++ b/lib/eal/common/malloc_elem.h
>>>> @@ -272,6 +272,10 @@ old_malloc_size(struct malloc_elem *elem)
>>>>
>>>>    #else /* !RTE_MALLOC_ASAN */
>>>>
>>>> +static inline void
>>>> +asan_set_zone(void *ptr __rte_unused, size_t len __rte_unused,
>>>> +     uint32_t val __rte_unused) { }
>>>> +
>>>>    static inline void
>>>>    asan_set_freezone(void *ptr __rte_unused, size_t size 
>>>> __rte_unused) { }
>>>>
>>>> diff --git a/lib/eal/common/malloc_heap.c 
>>>> b/lib/eal/common/malloc_heap.c
>>>> index 6c572b6f2c..5913d9f862 100644
>>>> --- a/lib/eal/common/malloc_heap.c
>>>> +++ b/lib/eal/common/malloc_heap.c
>>>> @@ -860,6 +860,7 @@ malloc_heap_free(struct malloc_elem *elem)
>>>>        size_t len, aligned_len, page_sz;
>>>>        struct rte_memseg_list *msl;
>>>>        unsigned int i, n_segs, before_space, after_space;
>>>> +     bool unmapped_pages = false;
>>>>        int ret;
>>>>        const struct internal_config *internal_conf =
>>>>                eal_get_internal_configuration();
>>>> @@ -999,6 +1000,13 @@ malloc_heap_free(struct malloc_elem *elem)
>>>>
>>>>                /* don't care if any of this fails */
>>>>                malloc_heap_free_pages(aligned_start, aligned_len);
>>>> +             /*
>>>> +              * Clear any poisoning in ASan for the associated 
>>>> pages so that
>>>> +              * next time EAL maps those pages, the allocator can 
>>>> access
>>>> +              * them.
>>>> +              */
>>>> +             asan_set_zone(aligned_start, aligned_len, 0x00);
>>>> +             unmapped_pages = true;
>>>>
>>>>                request_sync();
>>>>        } else {
>>>> @@ -1032,7 +1040,9 @@ malloc_heap_free(struct malloc_elem *elem)
>>>>
>>>>        rte_mcfg_mem_write_unlock();
>>>>    free_unlock:
>>>> -     asan_set_freezone(asan_ptr, asan_data_len);
>>>> +     /* Poison memory range if belonging to some still mapped 
>>>> pages. */
>>>> +     if (!unmapped_pages)
>>>> +             asan_set_freezone(asan_ptr, asan_data_len);
>>>>
>>>>        rte_spinlock_unlock(&(heap->lock));
>>>>        return ret;
>>>
>>> I suspect the patch should be a little more complicated than that. When
>>> we unmap pages, we don't necessarily unmap the entire malloc element, it
>>> could be that we have a freed allocation like so:
>>>
>>> | malloc header | free space | unmapped space | free space | next malloc
>>> header |
>>>
>>> So, i think the freezone should be set from asan_ptr till aligned_start,
>>> and then from (aligned_start + aligned_len) till (asan_ptr +
>>> asan_data_len). Does that make sense?
>>
>> (btw, I get a bounce for Zhihong mail address, is he not working at
>> Intel anymore?)
>>
>> To be honest, I don't understand if we can get to this situation :-)
>> (especially the free space after the unmapped region).
>> But I guess you mean something like (on top of current patch):
>>
>> @@ -1040,9 +1040,25 @@ malloc_heap_free(struct malloc_elem *elem)
>>
>>          rte_mcfg_mem_write_unlock();
>>   free_unlock:
>> -       /* Poison memory range if belonging to some still mapped 
>> pages. */
>> -       if (!unmapped_pages)
>> +       if (!unmapped_pages) {
>>                  asan_set_freezone(asan_ptr, asan_data_len);
>> +       } else {
>> +               /*
>> +                * We may be in a situation where we unmapped pages 
>> like this:
>> +                * malloc header | free space | unmapped space | free
>> space | malloc header
>> +                */
>> +               void *free1_start = asan_ptr;
>> +               void *free1_end = aligned_start;
>> +               void *free2_start = RTE_PTR_ADD(aligned_start, 
>> aligned_len);
>> +               void *free2_end = RTE_PTR_ADD(asan_ptr, asan_data_len);
>> +
>> +               if (free1_start < free1_end)
>> +                       asan_set_freezone(free1_start,
>> +                               RTE_PTR_DIFF(free1_end, free1_start));
>> +               if (free2_start < free2_end)
>> +                       asan_set_freezone(free2_start,
>> +                               RTE_PTR_DIFF(free2_end, free2_start));
>> +       }
>>
>>          rte_spinlock_unlock(&(heap->lock));
>>          return ret;
>>
> 
> Something like that, yes. I will have to think through this a bit more, 
> especially in light of your func_reentrancy splat :)
> 

So, the reason splat in func_reentrancy test happens is as follows: the 
above patch is sorta correct (i have a different one but does the same 
thing), but incomplete. What happens then is when we add new memory, we 
are integrating it into our existing malloc heap, which triggers 
`malloc_elem_join_adjacent_free()` which will trigger a write into old 
header space being merged, which may be marked as "freed". So, again we 
are hit with our internal allocator messing with ASan.

To properly fix this is to answer the following question: what is the 
goal of having ASan support in DPDK? Is it there to catch bugs *in the 
allocator*, or can we just trust that our allocator code is correct, and 
only concern ourselves with user-allocated areas of the code? Because it 
seems like the best way to address this issue would be to just avoid 
triggering ASan checks for certain allocator-internal actions: this way, 
we don't need to care what allocator itself does, just what user code 
does. As in, IIRC there was a compiler attribute that disables ASan 
checks for a specific function: perhaps we could just wrap certain 
access in that and be done with it?

What do you think?
David Marchand April 26, 2022, 2:15 p.m. UTC | #6
On Tue, Apr 26, 2022 at 2:54 PM Burakov, Anatoly
<anatoly.burakov@intel.com> wrote:
> >> @@ -1040,9 +1040,25 @@ malloc_heap_free(struct malloc_elem *elem)
> >>
> >>          rte_mcfg_mem_write_unlock();
> >>   free_unlock:
> >> -       /* Poison memory range if belonging to some still mapped
> >> pages. */
> >> -       if (!unmapped_pages)
> >> +       if (!unmapped_pages) {
> >>                  asan_set_freezone(asan_ptr, asan_data_len);
> >> +       } else {
> >> +               /*
> >> +                * We may be in a situation where we unmapped pages
> >> like this:
> >> +                * malloc header | free space | unmapped space | free
> >> space | malloc header
> >> +                */
> >> +               void *free1_start = asan_ptr;
> >> +               void *free1_end = aligned_start;
> >> +               void *free2_start = RTE_PTR_ADD(aligned_start,
> >> aligned_len);
> >> +               void *free2_end = RTE_PTR_ADD(asan_ptr, asan_data_len);
> >> +
> >> +               if (free1_start < free1_end)
> >> +                       asan_set_freezone(free1_start,
> >> +                               RTE_PTR_DIFF(free1_end, free1_start));
> >> +               if (free2_start < free2_end)
> >> +                       asan_set_freezone(free2_start,
> >> +                               RTE_PTR_DIFF(free2_end, free2_start));
> >> +       }
> >>
> >>          rte_spinlock_unlock(&(heap->lock));
> >>          return ret;
> >>
> >
> > Something like that, yes. I will have to think through this a bit more,
> > especially in light of your func_reentrancy splat :)
> >
>
> So, the reason splat in func_reentrancy test happens is as follows: the
> above patch is sorta correct (i have a different one but does the same
> thing), but incomplete. What happens then is when we add new memory, we
> are integrating it into our existing malloc heap, which triggers
> `malloc_elem_join_adjacent_free()` which will trigger a write into old
> header space being merged, which may be marked as "freed". So, again we
> are hit with our internal allocator messing with ASan.

I ended up with the same conclusion.
Thanks for confirming.


>
> To properly fix this is to answer the following question: what is the
> goal of having ASan support in DPDK? Is it there to catch bugs *in the
> allocator*, or can we just trust that our allocator code is correct, and
> only concern ourselves with user-allocated areas of the code? Because it

The best would be to handle both.
I don't think clang disables ASan for the instrumentations on malloc.


> seems like the best way to address this issue would be to just avoid
> triggering ASan checks for certain allocator-internal actions: this way,
> we don't need to care what allocator itself does, just what user code
> does. As in, IIRC there was a compiler attribute that disables ASan
> checks for a specific function: perhaps we could just wrap certain
> access in that and be done with it?
>
> What do you think?

It is tempting because it is the easiest way to avoid the issue.
Though, by waiving those checks in the allocator, does it leave the
ASan shadow in a consistent state?
Burakov, Anatoly April 26, 2022, 4:07 p.m. UTC | #7
On 26-Apr-22 3:15 PM, David Marchand wrote:
> On Tue, Apr 26, 2022 at 2:54 PM Burakov, Anatoly
> <anatoly.burakov@intel.com> wrote:
>>>> @@ -1040,9 +1040,25 @@ malloc_heap_free(struct malloc_elem *elem)
>>>>
>>>>           rte_mcfg_mem_write_unlock();
>>>>    free_unlock:
>>>> -       /* Poison memory range if belonging to some still mapped
>>>> pages. */
>>>> -       if (!unmapped_pages)
>>>> +       if (!unmapped_pages) {
>>>>                   asan_set_freezone(asan_ptr, asan_data_len);
>>>> +       } else {
>>>> +               /*
>>>> +                * We may be in a situation where we unmapped pages
>>>> like this:
>>>> +                * malloc header | free space | unmapped space | free
>>>> space | malloc header
>>>> +                */
>>>> +               void *free1_start = asan_ptr;
>>>> +               void *free1_end = aligned_start;
>>>> +               void *free2_start = RTE_PTR_ADD(aligned_start,
>>>> aligned_len);
>>>> +               void *free2_end = RTE_PTR_ADD(asan_ptr, asan_data_len);
>>>> +
>>>> +               if (free1_start < free1_end)
>>>> +                       asan_set_freezone(free1_start,
>>>> +                               RTE_PTR_DIFF(free1_end, free1_start));
>>>> +               if (free2_start < free2_end)
>>>> +                       asan_set_freezone(free2_start,
>>>> +                               RTE_PTR_DIFF(free2_end, free2_start));
>>>> +       }
>>>>
>>>>           rte_spinlock_unlock(&(heap->lock));
>>>>           return ret;
>>>>
>>>
>>> Something like that, yes. I will have to think through this a bit more,
>>> especially in light of your func_reentrancy splat :)
>>>
>>
>> So, the reason the splat in the func_reentrancy test happens is as follows: the
>> above patch is sorta correct (i have a different one but does the same
>> thing), but incomplete. What happens then is when we add new memory, we
>> are integrating it into our existing malloc heap, which triggers
>> `malloc_elem_join_adjacent_free()` which will trigger a write into old
>> header space being merged, which may be marked as "freed". So, again we
>> are hit with our internal allocator messing with ASan.
> 
> I ended up with the same conclusion.
> Thanks for confirming.
> 
> 
>>
>> To properly fix this is to answer the following question: what is the
>> goal of having ASan support in DPDK? Is it there to catch bugs *in the
>> allocator*, or can we just trust that our allocator code is correct, and
>> only concern ourselves with user-allocated areas of the code? Because it
> 
> The best would be to handle both.
> I don't think clang disables ASan for the instrumentations on malloc.

I've actually prototyped these changes a bit. We use memset in a few 
places, and that one can't be disabled as far as I can tell (not without 
blacklisting memset for entire DPDK).

> 
> 
>> seems like the best way to address this issue would be to just avoid
>> triggering ASan checks for certain allocator-internal actions: this way,
>> we don't need to care what allocator itself does, just what user code
>> does. As in, IIRC there was a compiler attribute that disables ASan
>> checks for a specific function: perhaps we could just wrap certain
>> access in that and be done with it?
>>
>> What do you think?
> 
> It is tempting because it is the easiest way to avoid the issue.
> Though, by waiving those checks in the allocator, does it leave the
> ASan shadow in a consistent state?
> 

The "consistent state" is kinda difficult to achieve because there is no 
"default" state for memory - sometimes it comes as available (0x00), 
sometimes it is marked as already freed (0xFF). So, coming into a malloc 
function, we don't know whether the memory we're about to mess with is 
0x00 or 0xFF.

What we could do is mark every malloc header with 0xFF regardless of its 
status, and leave the rest to "regular" zoning. This would be strange 
from ASan's point of view (because we're marking memory as "freed" when 
it wasn't ever allocated), but at least this would be consistent :D
Burakov, Anatoly April 27, 2022, 3:32 p.m. UTC | #8
On 26-Apr-22 5:07 PM, Burakov, Anatoly wrote:
> On 26-Apr-22 3:15 PM, David Marchand wrote:
>> On Tue, Apr 26, 2022 at 2:54 PM Burakov, Anatoly
>> <anatoly.burakov@intel.com> wrote:
>>>>> @@ -1040,9 +1040,25 @@ malloc_heap_free(struct malloc_elem *elem)
>>>>>
>>>>>           rte_mcfg_mem_write_unlock();
>>>>>    free_unlock:
>>>>> -       /* Poison memory range if belonging to some still mapped
>>>>> pages. */
>>>>> -       if (!unmapped_pages)
>>>>> +       if (!unmapped_pages) {
>>>>>                   asan_set_freezone(asan_ptr, asan_data_len);
>>>>> +       } else {
>>>>> +               /*
>>>>> +                * We may be in a situation where we unmapped pages
>>>>> like this:
>>>>> +                * malloc header | free space | unmapped space | free
>>>>> space | malloc header
>>>>> +                */
>>>>> +               void *free1_start = asan_ptr;
>>>>> +               void *free1_end = aligned_start;
>>>>> +               void *free2_start = RTE_PTR_ADD(aligned_start,
>>>>> aligned_len);
>>>>> +               void *free2_end = RTE_PTR_ADD(asan_ptr, 
>>>>> asan_data_len);
>>>>> +
>>>>> +               if (free1_start < free1_end)
>>>>> +                       asan_set_freezone(free1_start,
>>>>> +                               RTE_PTR_DIFF(free1_end, free1_start));
>>>>> +               if (free2_start < free2_end)
>>>>> +                       asan_set_freezone(free2_start,
>>>>> +                               RTE_PTR_DIFF(free2_end, free2_start));
>>>>> +       }
>>>>>
>>>>>           rte_spinlock_unlock(&(heap->lock));
>>>>>           return ret;
>>>>>
>>>>
>>>> Something like that, yes. I will have to think through this a bit more,
>>>> especially in light of your func_reentrancy splat :)
>>>>
>>>
>>> So, the reason the splat in the func_reentrancy test happens is as follows: the
>>> above patch is sorta correct (i have a different one but does the same
>>> thing), but incomplete. What happens then is when we add new memory, we
>>> are integrating it into our existing malloc heap, which triggers
>>> `malloc_elem_join_adjacent_free()` which will trigger a write into old
>>> header space being merged, which may be marked as "freed". So, again we
>>> are hit with our internal allocator messing with ASan.
>>
>> I ended up with the same conclusion.
>> Thanks for confirming.
>>
>>
>>>
>>> To properly fix this is to answer the following question: what is the
>>> goal of having ASan support in DPDK? Is it there to catch bugs *in the
>>> allocator*, or can we just trust that our allocator code is correct, and
>>> only concern ourselves with user-allocated areas of the code? Because it
>>
>> The best would be to handle both.
>> I don't think clang disables ASan for the instrumentations on malloc.
> 
> I've actually prototyped these changes a bit. We use memset in a few 
> places, and that one can't be disabled as far as I can tell (not without 
> blacklisting memset for entire DPDK).
> 
>>
>>
>>> seems like the best way to address this issue would be to just avoid
>>> triggering ASan checks for certain allocator-internal actions: this way,
>>> we don't need to care what allocator itself does, just what user code
>>> does. As in, IIRC there was a compiler attribute that disables ASan
>>> checks for a specific function: perhaps we could just wrap certain
>>> access in that and be done with it?
>>>
>>> What do you think?
>>
>> It is tempting because it is the easiest way to avoid the issue.
>> Though, by waiving those checks in the allocator, does it leave the
>> ASan shadow in a consistent state?
>>
> 
> The "consistent state" is kinda difficult to achieve because there is no 
> "default" state for memory - sometimes it comes as available (0x00), 
> sometimes it is marked as already freed (0xFF). So, coming into a malloc 
> function, we don't know whether the memory we're about to mess with is 
> 0x00 or 0xFF.
> 
> What we could do is mark every malloc header with 0xFF regardless of its 
> status, and leave the rest to "regular" zoning. This would be strange 
> from ASan's point of view (because we're marking memory as "freed" when 
> it wasn't ever allocated), but at least this would be consistent :D
> 

I've been prototyping a solution for this, but I keep bumping into our 
dual usage of ASan: ASan doesn't differentiate between 
allocator-internal accesses, and user code accesses. Therefore, we can't 
either, so either we start marking areas as "accessible" even when they 
shouldn't be (such as unallocated areas that correspond to malloc 
headers), or we only use ASan to mark user-available areas and forego 
its usage inside the allocator entirely.

Right now, the best I can think of is the combination of approaches 
discussed earlier: that is, we mark all malloc element header areas as 
"available" unconditionally (thereby sacrificing part of the protection 
ASan provides us - because we can't prevent ASan from complaining about 
accesses from inside the allocator without losing our ability to detect 
cases where user accidentally accesses a malloc element), and we also 
mark unmapped memory as "available" (because writing to it will trigger 
a fault anyway).

I haven't yet figured out the cleanest solution (we miss ASan zoning for 
headers somewhere), but at least i got func reentrancy test to pass :D
diff mbox series

Patch

diff --git a/lib/eal/common/malloc_elem.h b/lib/eal/common/malloc_elem.h
index 228f178418..b859003722 100644
--- a/lib/eal/common/malloc_elem.h
+++ b/lib/eal/common/malloc_elem.h
@@ -272,6 +272,10 @@  old_malloc_size(struct malloc_elem *elem)
 
 #else /* !RTE_MALLOC_ASAN */
 
+static inline void
+asan_set_zone(void *ptr __rte_unused, size_t len __rte_unused,
+	uint32_t val __rte_unused) { }
+
 static inline void
 asan_set_freezone(void *ptr __rte_unused, size_t size __rte_unused) { }
 
diff --git a/lib/eal/common/malloc_heap.c b/lib/eal/common/malloc_heap.c
index 6c572b6f2c..5913d9f862 100644
--- a/lib/eal/common/malloc_heap.c
+++ b/lib/eal/common/malloc_heap.c
@@ -860,6 +860,7 @@  malloc_heap_free(struct malloc_elem *elem)
 	size_t len, aligned_len, page_sz;
 	struct rte_memseg_list *msl;
 	unsigned int i, n_segs, before_space, after_space;
+	bool unmapped_pages = false;
 	int ret;
 	const struct internal_config *internal_conf =
 		eal_get_internal_configuration();
@@ -999,6 +1000,13 @@  malloc_heap_free(struct malloc_elem *elem)
 
 		/* don't care if any of this fails */
 		malloc_heap_free_pages(aligned_start, aligned_len);
+		/*
+		 * Clear any poisoning in ASan for the associated pages so that
+		 * next time EAL maps those pages, the allocator can access
+		 * them.
+		 */
+		asan_set_zone(aligned_start, aligned_len, 0x00);
+		unmapped_pages = true;
 
 		request_sync();
 	} else {
@@ -1032,7 +1040,9 @@  malloc_heap_free(struct malloc_elem *elem)
 
 	rte_mcfg_mem_write_unlock();
 free_unlock:
-	asan_set_freezone(asan_ptr, asan_data_len);
+	/* Poison memory range if belonging to some still mapped pages. */
+	if (!unmapped_pages)
+		asan_set_freezone(asan_ptr, asan_data_len);
 
 	rte_spinlock_unlock(&(heap->lock));
 	return ret;