[v3,3/3] lib/lpm: memory orderings to avoid race conditions for v20

Message ID 20190627093751.7746-3-ruifeng.wang@arm.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series [v3,1/3] lib/lpm: not inline unnecessary functions

Checks

Context               Check     Description
ci/checkpatch         success   coding style OK
ci/Intel-compilation  success   Compilation OK

Commit Message

Ruifeng Wang June 27, 2019, 9:37 a.m. UTC
  When a tbl8 group is getting attached to a tbl24 entry, lookup
might fail even though the entry is configured in the table.

For example, consider an LPM table configured with 10.10.10.1/24.
When a new entry 10.10.10.32/28 is being added, a new tbl8
group is allocated and tbl24 entry is changed to point to
the tbl8 group. If the tbl24 entry is written without the tbl8
group entries updated, a lookup on 10.10.10.9 will return
failure.

Correct memory orderings are required to ensure that the
store to tbl24 does not happen before the stores to tbl8 group
entries complete.

Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
---
v3: no changes
v2: fixed clang building issue by supplying alignment attribute.
v1: initial version

 lib/librte_lpm/rte_lpm.c | 31 ++++++++++++++++++++++++-------
 lib/librte_lpm/rte_lpm.h |  4 ++--
 2 files changed, 26 insertions(+), 9 deletions(-)
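
As a minimal sketch of the ordering requirement described above (hypothetical, simplified 32-bit entries and names; not the actual rte_lpm code from this patch):

	#include <stdint.h>

	#define TBL8_GROUP_SIZE 256

	static uint32_t tbl24[1 << 24];
	static uint32_t tbl8[256 * TBL8_GROUP_SIZE];

	static void
	attach_tbl8_group(uint32_t tbl24_index, uint32_t tbl8_group_start,
			uint32_t new_tbl24_entry, uint32_t init_tbl8_entry)
	{
		uint32_t i;

		/* Fill every entry of the new tbl8 group first. */
		for (i = 0; i < TBL8_GROUP_SIZE; i++)
			tbl8[tbl8_group_start + i] = init_tbl8_entry;

		/* Publish the tbl24 entry with release semantics so the tbl8
		 * stores above cannot be reordered past this store; a reader
		 * that sees the new tbl24 entry also sees an initialized
		 * tbl8 group.
		 */
		__atomic_store_n(&tbl24[tbl24_index], new_tbl24_entry,
				__ATOMIC_RELEASE);
	}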
  

Comments

Vladimir Medvedkin June 28, 2019, 1:33 p.m. UTC | #1
Hi Wang,

On 27/06/2019 10:37, Ruifeng Wang wrote:
> When a tbl8 group is getting attached to a tbl24 entry, lookup
> might fail even though the entry is configured in the table.
>
> For ex: consider a LPM table configured with 10.10.10.1/24.
> When a new entry 10.10.10.32/28 is being added, a new tbl8
> group is allocated and tbl24 entry is changed to point to
> the tbl8 group. If the tbl24 entry is written without the tbl8
> group entries updated, a lookup on 10.10.10.9 will return
> failure.
>
> Correct memory orderings are required to ensure that the
> store to tbl24 does not happen before the stores to tbl8 group
> entries complete.
>
> Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> ---
> v3: no changes
> v2: fixed clang building issue by supplying alignment attribute.
> v1: initial version
>
>   lib/librte_lpm/rte_lpm.c | 31 ++++++++++++++++++++++++-------
>   lib/librte_lpm/rte_lpm.h |  4 ++--
>   2 files changed, 26 insertions(+), 9 deletions(-)
>
> diff --git a/lib/librte_lpm/rte_lpm.c b/lib/librte_lpm/rte_lpm.c
> index fabd13fb0..5f8d494ae 100644
> --- a/lib/librte_lpm/rte_lpm.c
> +++ b/lib/librte_lpm/rte_lpm.c
> @@ -737,7 +737,8 @@ add_depth_small_v20(struct rte_lpm_v20 *lpm, uint32_t ip, uint8_t depth,
>   			/* Setting tbl24 entry in one go to avoid race
>   			 * conditions
>   			 */
> -			lpm->tbl24[i] = new_tbl24_entry;
> +			__atomic_store(&lpm->tbl24[i], &new_tbl24_entry,
> +					__ATOMIC_RELEASE);
>   
>   			continue;
>   		}
> @@ -892,7 +893,8 @@ add_depth_big_v20(struct rte_lpm_v20 *lpm, uint32_t ip_masked, uint8_t depth,
>   			.depth = 0,
>   		};
>   
> -		lpm->tbl24[tbl24_index] = new_tbl24_entry;
> +		__atomic_store(&lpm->tbl24[tbl24_index], &new_tbl24_entry,
> +				__ATOMIC_RELEASE);
>   
>   	} /* If valid entry but not extended calculate the index into Table8. */
>   	else if (lpm->tbl24[tbl24_index].valid_group == 0) {
> @@ -938,7 +940,8 @@ add_depth_big_v20(struct rte_lpm_v20 *lpm, uint32_t ip_masked, uint8_t depth,
>   				.depth = 0,
>   		};
>   
> -		lpm->tbl24[tbl24_index] = new_tbl24_entry;
> +		__atomic_store(&lpm->tbl24[tbl24_index], &new_tbl24_entry,
> +				__ATOMIC_RELEASE);
>   
>   	} else { /*
>   		* If it is valid, extended entry calculate the index into tbl8.
> @@ -1320,7 +1323,14 @@ delete_depth_small_v20(struct rte_lpm_v20 *lpm, uint32_t ip_masked,
>   
>   			if (lpm->tbl24[i].valid_group == 0 &&
>   					lpm->tbl24[i].depth <= depth) {
> -				lpm->tbl24[i].valid = INVALID;
> +				struct rte_lpm_tbl_entry_v20 zero_tbl_entry = {
> +						.valid = INVALID,
> +						.depth = 0,
> +						.valid_group = 0,
> +					};
> +					zero_tbl_entry.next_hop = 0;

Please use the same var name in both v20 and v1604 (zero_tbl24_entry). 
The same goes for the struct initialization. In v1604 you use:

struct rte_lpm_tbl_entry zero_tbl24_entry = {0};

> +				__atomic_store(&lpm->tbl24[i], &zero_tbl_entry,
> +						__ATOMIC_RELEASE);
>   			} else if (lpm->tbl24[i].valid_group == 1) {
>   				/*
>   				 * If TBL24 entry is extended, then there has
> @@ -1365,7 +1375,8 @@ delete_depth_small_v20(struct rte_lpm_v20 *lpm, uint32_t ip_masked,
>   
>   			if (lpm->tbl24[i].valid_group == 0 &&
>   					lpm->tbl24[i].depth <= depth) {
> -				lpm->tbl24[i] = new_tbl24_entry;
> +				__atomic_store(&lpm->tbl24[i], &new_tbl24_entry,
> +						__ATOMIC_RELEASE);
>   			} else  if (lpm->tbl24[i].valid_group == 1) {
>   				/*
>   				 * If TBL24 entry is extended, then there has
> @@ -1647,8 +1658,11 @@ delete_depth_big_v20(struct rte_lpm_v20 *lpm, uint32_t ip_masked,
>   	tbl8_recycle_index = tbl8_recycle_check_v20(lpm->tbl8, tbl8_group_start);
>   
>   	if (tbl8_recycle_index == -EINVAL) {
> -		/* Set tbl24 before freeing tbl8 to avoid race condition. */
> +		/* Set tbl24 before freeing tbl8 to avoid race condition.
> +		 * Prevent the free of the tbl8 group from hoisting.
> +		 */
>   		lpm->tbl24[tbl24_index].valid = 0;
> +		__atomic_thread_fence(__ATOMIC_RELEASE);
>   		tbl8_free_v20(lpm->tbl8, tbl8_group_start);
>   	} else if (tbl8_recycle_index > -1) {
>   		/* Update tbl24 entry. */
> @@ -1659,8 +1673,11 @@ delete_depth_big_v20(struct rte_lpm_v20 *lpm, uint32_t ip_masked,
>   			.depth = lpm->tbl8[tbl8_recycle_index].depth,
>   		};
>   
> -		/* Set tbl24 before freeing tbl8 to avoid race condition. */
> +		/* Set tbl24 before freeing tbl8 to avoid race condition.
> +		 * Prevent the free of the tbl8 group from hoisting.
> +		 */
>   		lpm->tbl24[tbl24_index] = new_tbl24_entry;
> +		__atomic_thread_fence(__ATOMIC_RELEASE);
>   		tbl8_free_v20(lpm->tbl8, tbl8_group_start);
>   	}
>   
> diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
> index 6f5704c5c..98c70ecbe 100644
> --- a/lib/librte_lpm/rte_lpm.h
> +++ b/lib/librte_lpm/rte_lpm.h
> @@ -88,7 +88,7 @@ struct rte_lpm_tbl_entry_v20 {
>   	 */
>   	uint8_t valid_group :1;
>   	uint8_t depth       :6; /**< Rule depth. */
> -};
> +} __rte_aligned(2);

I think it is better to use __rte_aligned(sizeof(uint16_t)).

>   
>   __extension__
>   struct rte_lpm_tbl_entry {
> @@ -121,7 +121,7 @@ struct rte_lpm_tbl_entry_v20 {
>   		uint8_t group_idx;
>   		uint8_t next_hop;
>   	};
> -};
> +} __rte_aligned(2);
>   
>   __extension__
>   struct rte_lpm_tbl_entry {

As a general remark, consider writing all of the tbl entries, including 
tbl8, with atomic_store. Right now "lpm->tbl8[j] = new_tbl8_entry;" looks like:

      1e9:       44 88 9c 47 40 01 00 02    mov    %r11b,0x2000140(%rdi,%rax,2)   <- write first byte
      1f1:       48 83 c0 01                add    $0x1,%rax
      1f5:       42 88 8c 47 41 01 00 02    mov    %cl,0x2000141(%rdi,%r8,2)      <- write second byte

This may cause an incorrect nexthop to be returned. If the byte with the 
valid flag is updated first, the old (and maybe invalid) next hop could 
be returned.

Please evaluate performance drop after.
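
As a sketch of what such a single-copy tbl8 store could look like (using a hypothetical stand-in entry type; the real code would operate on struct rte_lpm_tbl_entry_v20 and lpm->tbl8):

	#include <stdint.h>

	/* Hypothetical 2-byte entry, aligned so the compiler can copy it
	 * with one 16-bit store (mirrors the __rte_aligned(2) change).
	 */
	struct tbl8_entry_sketch {
		uint8_t next_hop;
		uint8_t valid;	/* stands in for the valid/valid_group/depth bits */
	} __attribute__((aligned(2)));

	static void
	set_tbl8_entry(struct tbl8_entry_sketch *tbl8, uint32_t j,
			struct tbl8_entry_sketch new_entry)
	{
		/* One 16-bit store instead of two byte stores; relaxed
		 * ordering is enough here, the goal is only to avoid a torn
		 * write where the valid byte becomes visible before next_hop.
		 */
		__atomic_store(&tbl8[j], &new_entry, __ATOMIC_RELAXED);
	}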
  
Honnappa Nagarahalli June 29, 2019, 5:35 p.m. UTC | #2
<snip>

> 
> As a general remark consider writing all of the tbl entries including
> tbl8 with atomic_store. Now "lpm->tbl8[j] = new_tbl8_entry;" looks like
> 
>       1e9:       44 88 9c 47 40 01 00    mov
> %r11b,0x2000140(%rdi,%rax,2) <-write first byte
>       1f0:       02
>       1f1:       48 83 c0 01             add    $0x1,%rax
>       1f5:       42 88 8c 47 41 01 00    mov %cl,0x2000141(%rdi,%r8,2) <-write
> second byte
>       1fc:       02
> 
> This may cause an incorrect nexthop to be returned. If the byte with valid flag
> is updated first, the old(and maybe invalid) next hop could be returned.
+1

It is surprising that the compiler is not generating a single 32b store. As you mentioned, a 'relaxed' __atomic_store_n should be good.

> 
> Please evaluate performance drop after.
> 
> --
> Regards,
> Vladimir
  
Ruifeng Wang July 1, 2019, 7:08 a.m. UTC | #3
Hi Vladimir,

> -----Original Message-----
> From: Medvedkin, Vladimir <vladimir.medvedkin@intel.com>
> Sent: Friday, June 28, 2019 21:34
> To: Ruifeng Wang (Arm Technology China) <Ruifeng.Wang@arm.com>;
> bruce.richardson@intel.com
> Cc: dev@dpdk.org; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> <Gavin.Hu@arm.com>; nd <nd@arm.com>
> Subject: Re: [PATCH v3 3/3] lib/lpm: memory orderings to avoid race
> conditions for v20
> 
> Hi Wang,
> 
> On 27/06/2019 10:37, Ruifeng Wang wrote:
> > When a tbl8 group is getting attached to a tbl24 entry, lookup might
> > fail even though the entry is configured in the table.
> >
> > For ex: consider a LPM table configured with 10.10.10.1/24.
> > When a new entry 10.10.10.32/28 is being added, a new tbl8 group is
> > allocated and tbl24 entry is changed to point to the tbl8 group. If
> > the tbl24 entry is written without the tbl8 group entries updated, a
> > lookup on 10.10.10.9 will return failure.
> >
> > Correct memory orderings are required to ensure that the store to
> > tbl24 does not happen before the stores to tbl8 group entries
> > complete.
> >
> > Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> > ---
> > v3: no changes
> > v2: fixed clang building issue by supplying alignment attribute.
> > v1: initial version
> >
> >   lib/librte_lpm/rte_lpm.c | 31 ++++++++++++++++++++++++-------
> >   lib/librte_lpm/rte_lpm.h |  4 ++--
> >   2 files changed, 26 insertions(+), 9 deletions(-)
> >
> > diff --git a/lib/librte_lpm/rte_lpm.c b/lib/librte_lpm/rte_lpm.c index
> > fabd13fb0..5f8d494ae 100644
> > --- a/lib/librte_lpm/rte_lpm.c
> > +++ b/lib/librte_lpm/rte_lpm.c
> > @@ -737,7 +737,8 @@ add_depth_small_v20(struct rte_lpm_v20 *lpm,
> uint32_t ip, uint8_t depth,
> >   			/* Setting tbl24 entry in one go to avoid race
> >   			 * conditions
> >   			 */
> > -			lpm->tbl24[i] = new_tbl24_entry;
> > +			__atomic_store(&lpm->tbl24[i], &new_tbl24_entry,
> > +					__ATOMIC_RELEASE);
> >
> >   			continue;
> >   		}
> > @@ -892,7 +893,8 @@ add_depth_big_v20(struct rte_lpm_v20 *lpm,
> uint32_t ip_masked, uint8_t depth,
> >   			.depth = 0,
> >   		};
> >
> > -		lpm->tbl24[tbl24_index] = new_tbl24_entry;
> > +		__atomic_store(&lpm->tbl24[tbl24_index],
> &new_tbl24_entry,
> > +				__ATOMIC_RELEASE);
> >
> >   	} /* If valid entry but not extended calculate the index into Table8. */
> >   	else if (lpm->tbl24[tbl24_index].valid_group == 0) { @@ -938,7
> > +940,8 @@ add_depth_big_v20(struct rte_lpm_v20 *lpm, uint32_t
> ip_masked, uint8_t depth,
> >   				.depth = 0,
> >   		};
> >
> > -		lpm->tbl24[tbl24_index] = new_tbl24_entry;
> > +		__atomic_store(&lpm->tbl24[tbl24_index],
> &new_tbl24_entry,
> > +				__ATOMIC_RELEASE);
> >
> >   	} else { /*
> >   		* If it is valid, extended entry calculate the index into tbl8.
> > @@ -1320,7 +1323,14 @@ delete_depth_small_v20(struct rte_lpm_v20
> *lpm,
> > uint32_t ip_masked,
> >
> >   			if (lpm->tbl24[i].valid_group == 0 &&
> >   					lpm->tbl24[i].depth <= depth) {
> > -				lpm->tbl24[i].valid = INVALID;
> > +				struct rte_lpm_tbl_entry_v20 zero_tbl_entry
> = {
> > +						.valid = INVALID,
> > +						.depth = 0,
> > +						.valid_group = 0,
> > +					};
> > +					zero_tbl_entry.next_hop = 0;
> 
> Please use the same var name in both v20 and v1604 (zero_tbl24_entry).
> The same is for struct initialization. In 1604 you use:
> 
> struct rte_lpm_tbl_entry zero_tbl24_entry = {0};
> 
I accept the suggestion on the var name.
However, a struct initialization like the one in v1604 causes a meson build failure.
This appears to be related to the struct with union members being used for both big-endian and little-endian systems.
See commit e13676bfe7.
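
For reference, a sketch of the explicit initialization that builds on both byte orders (the same pattern as in the hunk being discussed, only with the renamed variable; INVALID and the v20 entry type come from rte_lpm.h):

	struct rte_lpm_tbl_entry_v20 zero_tbl24_entry = {
		.valid = INVALID,
		.depth = 0,
		.valid_group = 0,
	};
	/* next_hop sits in a union whose members are laid out differently
	 * for big- and little-endian builds (per the note above), so it is
	 * assigned outside of the designated initializer.
	 */
	zero_tbl24_entry.next_hop = 0;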

> > +				__atomic_store(&lpm->tbl24[i],
> &zero_tbl_entry,
> > +						__ATOMIC_RELEASE);
> >   			} else if (lpm->tbl24[i].valid_group == 1) {
> >   				/*
> >   				 * If TBL24 entry is extended, then there has
> @@ -1365,7 +1375,8
> > @@ delete_depth_small_v20(struct rte_lpm_v20 *lpm, uint32_t
> ip_masked,
> >
> >   			if (lpm->tbl24[i].valid_group == 0 &&
> >   					lpm->tbl24[i].depth <= depth) {
> > -				lpm->tbl24[i] = new_tbl24_entry;
> > +				__atomic_store(&lpm->tbl24[i],
> &new_tbl24_entry,
> > +						__ATOMIC_RELEASE);
> >   			} else  if (lpm->tbl24[i].valid_group == 1) {
> >   				/*
> >   				 * If TBL24 entry is extended, then there has
> @@ -1647,8
> > +1658,11 @@ delete_depth_big_v20(struct rte_lpm_v20 *lpm, uint32_t
> ip_masked,
> >   	tbl8_recycle_index = tbl8_recycle_check_v20(lpm->tbl8,
> > tbl8_group_start);
> >
> >   	if (tbl8_recycle_index == -EINVAL) {
> > -		/* Set tbl24 before freeing tbl8 to avoid race condition. */
> > +		/* Set tbl24 before freeing tbl8 to avoid race condition.
> > +		 * Prevent the free of the tbl8 group from hoisting.
> > +		 */
> >   		lpm->tbl24[tbl24_index].valid = 0;
> > +		__atomic_thread_fence(__ATOMIC_RELEASE);
> >   		tbl8_free_v20(lpm->tbl8, tbl8_group_start);
> >   	} else if (tbl8_recycle_index > -1) {
> >   		/* Update tbl24 entry. */
> > @@ -1659,8 +1673,11 @@ delete_depth_big_v20(struct rte_lpm_v20 *lpm,
> uint32_t ip_masked,
> >   			.depth = lpm->tbl8[tbl8_recycle_index].depth,
> >   		};
> >
> > -		/* Set tbl24 before freeing tbl8 to avoid race condition. */
> > +		/* Set tbl24 before freeing tbl8 to avoid race condition.
> > +		 * Prevent the free of the tbl8 group from hoisting.
> > +		 */
> >   		lpm->tbl24[tbl24_index] = new_tbl24_entry;
> > +		__atomic_thread_fence(__ATOMIC_RELEASE);
> >   		tbl8_free_v20(lpm->tbl8, tbl8_group_start);
> >   	}
> >
> > diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h index
> > 6f5704c5c..98c70ecbe 100644
> > --- a/lib/librte_lpm/rte_lpm.h
> > +++ b/lib/librte_lpm/rte_lpm.h
> > @@ -88,7 +88,7 @@ struct rte_lpm_tbl_entry_v20 {
> >   	 */
> >   	uint8_t valid_group :1;
> >   	uint8_t depth       :6; /**< Rule depth. */
> > -};
> > +} __rte_aligned(2);
> 
> I think it is better to __rte_aligned(sizeof(uint16_t)).
> 
Will include this change in next version.

> >
> >   __extension__
> >   struct rte_lpm_tbl_entry {
> > @@ -121,7 +121,7 @@ struct rte_lpm_tbl_entry_v20 {
> >   		uint8_t group_idx;
> >   		uint8_t next_hop;
> >   	};
> > -};
> > +} __rte_aligned(2);
> >
> >   __extension__
> >   struct rte_lpm_tbl_entry {
> 
> As a general remark consider writing all of the tbl entries including
> tbl8 with atomic_store. Now "lpm->tbl8[j] = new_tbl8_entry;" looks like
> 
>       1e9:       44 88 9c 47 40 01 00    mov
> %r11b,0x2000140(%rdi,%rax,2) <-write first byte
>       1f0:       02
>       1f1:       48 83 c0 01             add    $0x1,%rax
>       1f5:       42 88 8c 47 41 01 00    mov %cl,0x2000141(%rdi,%r8,2) <-write
> second byte
>       1fc:       02
> 
> This may cause an incorrect nexthop to be returned. If the byte with valid
> flag is updated first, the old(and maybe invalid) next hop could be returned.
> 
> Please evaluate performance drop after.
> 
Thanks for looking into this. It is a surprise that the compiler doesn't generate an atomic store for this data type.
Will include the change in next version.

> --
> Regards,
> Vladimir
  
Alex Kiselev July 5, 2019, 1:45 p.m. UTC | #4
Hi,

> <snip>
>
> >
> > As a general remark consider writing all of the tbl entries including
> > tbl8 with atomic_store. Now "lpm->tbl8[j] = new_tbl8_entry;" looks like
> >
> >       1e9:       44 88 9c 47 40 01 00    mov
> > %r11b,0x2000140(%rdi,%rax,2) <-write first byte
> >       1f0:       02
> >       1f1:       48 83 c0 01             add    $0x1,%rax
> >       1f5:       42 88 8c 47 41 01 00    mov %cl,0x2000141(%rdi,%r8,2) <-write
> > second byte
> >       1fc:       02
> >
> > This may cause an incorrect nexthop to be returned. If the byte with valid flag
> > is updated first, the old(and maybe invalid) next hop could be returned.
> +1
>
> It is surprising that the compiler is not generating a single 32b store. As you mentioned 'relaxed' __atomic_store_n should be good.

Am I right that the x86 platform is not affected by the bug, since
store-store reordering cannot happen on x86?
  
Vladimir Medvedkin July 5, 2019, 4:56 p.m. UTC | #5
Hi Alex,

On 05/07/2019 14:45, Alex Kiselev wrote:
> Hi,
>
>> <snip>
>>
>>> As a general remark consider writing all of the tbl entries including
>>> tbl8 with atomic_store. Now "lpm->tbl8[j] = new_tbl8_entry;" looks like
>>>
>>>        1e9:       44 88 9c 47 40 01 00    mov
>>> %r11b,0x2000140(%rdi,%rax,2) <-write first byte
>>>        1f0:       02
>>>        1f1:       48 83 c0 01             add    $0x1,%rax
>>>        1f5:       42 88 8c 47 41 01 00    mov %cl,0x2000141(%rdi,%r8,2) <-write
>>> second byte
>>>        1fc:       02
>>>
>>> This may cause an incorrect nexthop to be returned. If the byte with valid flag
>>> is updated first, the old(and maybe invalid) next hop could be returned.
>> +1
>>
>> It is surprising that the compiler is not generating a single 32b store. As you mentioned 'relaxed' __atomic_store_n should be good.
> Am I right that x86 platform is not affected by the bug since
> store-store could not be reordered on x86?
You're right, x86 shouldn't be affected. If I understand the asm correctly, 
the nexthop is written first for both _v20 and _v1604, and memory stores 
are retired in order.
  

Patch

diff --git a/lib/librte_lpm/rte_lpm.c b/lib/librte_lpm/rte_lpm.c
index fabd13fb0..5f8d494ae 100644
--- a/lib/librte_lpm/rte_lpm.c
+++ b/lib/librte_lpm/rte_lpm.c
@@ -737,7 +737,8 @@  add_depth_small_v20(struct rte_lpm_v20 *lpm, uint32_t ip, uint8_t depth,
 			/* Setting tbl24 entry in one go to avoid race
 			 * conditions
 			 */
-			lpm->tbl24[i] = new_tbl24_entry;
+			__atomic_store(&lpm->tbl24[i], &new_tbl24_entry,
+					__ATOMIC_RELEASE);
 
 			continue;
 		}
@@ -892,7 +893,8 @@  add_depth_big_v20(struct rte_lpm_v20 *lpm, uint32_t ip_masked, uint8_t depth,
 			.depth = 0,
 		};
 
-		lpm->tbl24[tbl24_index] = new_tbl24_entry;
+		__atomic_store(&lpm->tbl24[tbl24_index], &new_tbl24_entry,
+				__ATOMIC_RELEASE);
 
 	} /* If valid entry but not extended calculate the index into Table8. */
 	else if (lpm->tbl24[tbl24_index].valid_group == 0) {
@@ -938,7 +940,8 @@  add_depth_big_v20(struct rte_lpm_v20 *lpm, uint32_t ip_masked, uint8_t depth,
 				.depth = 0,
 		};
 
-		lpm->tbl24[tbl24_index] = new_tbl24_entry;
+		__atomic_store(&lpm->tbl24[tbl24_index], &new_tbl24_entry,
+				__ATOMIC_RELEASE);
 
 	} else { /*
 		* If it is valid, extended entry calculate the index into tbl8.
@@ -1320,7 +1323,14 @@  delete_depth_small_v20(struct rte_lpm_v20 *lpm, uint32_t ip_masked,
 
 			if (lpm->tbl24[i].valid_group == 0 &&
 					lpm->tbl24[i].depth <= depth) {
-				lpm->tbl24[i].valid = INVALID;
+				struct rte_lpm_tbl_entry_v20 zero_tbl_entry = {
+						.valid = INVALID,
+						.depth = 0,
+						.valid_group = 0,
+					};
+					zero_tbl_entry.next_hop = 0;
+				__atomic_store(&lpm->tbl24[i], &zero_tbl_entry,
+						__ATOMIC_RELEASE);
 			} else if (lpm->tbl24[i].valid_group == 1) {
 				/*
 				 * If TBL24 entry is extended, then there has
@@ -1365,7 +1375,8 @@  delete_depth_small_v20(struct rte_lpm_v20 *lpm, uint32_t ip_masked,
 
 			if (lpm->tbl24[i].valid_group == 0 &&
 					lpm->tbl24[i].depth <= depth) {
-				lpm->tbl24[i] = new_tbl24_entry;
+				__atomic_store(&lpm->tbl24[i], &new_tbl24_entry,
+						__ATOMIC_RELEASE);
 			} else  if (lpm->tbl24[i].valid_group == 1) {
 				/*
 				 * If TBL24 entry is extended, then there has
@@ -1647,8 +1658,11 @@  delete_depth_big_v20(struct rte_lpm_v20 *lpm, uint32_t ip_masked,
 	tbl8_recycle_index = tbl8_recycle_check_v20(lpm->tbl8, tbl8_group_start);
 
 	if (tbl8_recycle_index == -EINVAL) {
-		/* Set tbl24 before freeing tbl8 to avoid race condition. */
+		/* Set tbl24 before freeing tbl8 to avoid race condition.
+		 * Prevent the free of the tbl8 group from hoisting.
+		 */
 		lpm->tbl24[tbl24_index].valid = 0;
+		__atomic_thread_fence(__ATOMIC_RELEASE);
 		tbl8_free_v20(lpm->tbl8, tbl8_group_start);
 	} else if (tbl8_recycle_index > -1) {
 		/* Update tbl24 entry. */
@@ -1659,8 +1673,11 @@  delete_depth_big_v20(struct rte_lpm_v20 *lpm, uint32_t ip_masked,
 			.depth = lpm->tbl8[tbl8_recycle_index].depth,
 		};
 
-		/* Set tbl24 before freeing tbl8 to avoid race condition. */
+		/* Set tbl24 before freeing tbl8 to avoid race condition.
+		 * Prevent the free of the tbl8 group from hoisting.
+		 */
 		lpm->tbl24[tbl24_index] = new_tbl24_entry;
+		__atomic_thread_fence(__ATOMIC_RELEASE);
 		tbl8_free_v20(lpm->tbl8, tbl8_group_start);
 	}
 
diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
index 6f5704c5c..98c70ecbe 100644
--- a/lib/librte_lpm/rte_lpm.h
+++ b/lib/librte_lpm/rte_lpm.h
@@ -88,7 +88,7 @@  struct rte_lpm_tbl_entry_v20 {
 	 */
 	uint8_t valid_group :1;
 	uint8_t depth       :6; /**< Rule depth. */
-};
+} __rte_aligned(2);
 
 __extension__
 struct rte_lpm_tbl_entry {
@@ -121,7 +121,7 @@  struct rte_lpm_tbl_entry_v20 {
 		uint8_t group_idx;
 		uint8_t next_hop;
 	};
-};
+} __rte_aligned(2);
 
 __extension__
 struct rte_lpm_tbl_entry {