[v4,1/5] ring: add 64-bit headtail structure

Message ID 20190128181407.32739-2-gage.eads@intel.com
State Superseded, archived
Delegated to: Thomas Monjalon
Headers show
Series
  • Add non-blocking ring
Related show

Checks

Context Check Description
ci/Performance-Testing fail build patch failure
ci/Intel-compilation fail Compilation issues
ci/checkpatch success coding style OK

Commit Message

Eads, Gage Jan. 28, 2019, 6:14 p.m.
64-bit head and tail index widths greatly increase the time it takes for
them to wrap-around (with current CPU speeds, it won't happen within the
author's lifetime). This is fundamental to avoiding the ABA problem -- in
which a thread mistakes reading the same tail index in two accesses to mean
that the ring was not modified in the intervening time -- in the upcoming
non-blocking ring implementation. Using a 64-bit index makes the
possibility of this occurring effectively zero.

This commit places the new producer and consumer structures in the same
location in struct rte_ring as their 32-bit counterparts. Since the 32-bit
versions are padded out to a cache line, there is space for the new
structure without affecting the layout of struct rte_ring. Thus, the ABI is
preserved.

Signed-off-by: Gage Eads <gage.eads@intel.com>
---
 lib/librte_ring/rte_ring.h         |  23 +++++-
 lib/librte_ring/rte_ring_c11_mem.h | 153 +++++++++++++++++++++++++++++++++++++
 lib/librte_ring/rte_ring_generic.h | 139 +++++++++++++++++++++++++++++++++
 3 files changed, 312 insertions(+), 3 deletions(-)

Comments

Ola Liljedahl Jan. 29, 2019, 12:56 p.m. | #1
On Mon, 2019-01-28 at 12:14 -0600, Gage Eads wrote:
> 64-bit head and tail index widths greatly increases the time it takes for
> them to wrap-around (with current CPU speeds, it won't happen within the
> author's lifetime). This is fundamental to avoiding the ABA problem -- in
> which a thread mistakes reading the same tail index in two accesses to mean
> that the ring was not modified in the intervening time -- in the upcoming
> non-blocking ring implementation. Using a 64-bit index makes the
> possibility of this occurring effectively zero.
Just an observation.
The following invariant holds (using ring_size instead of mask):
∀ index: ring[index % ring_size].index % ring_size == index % ring_size
i.e. the N (N = log2(ring_size)) least significant bits of ring[].index will
always be the same (for a specific slot), so they serve no purpose.

This means we don't have to store the whole index in each slot, it is enough to
store "index / ring_size" (which I call the lap counter). This could be useful
for an implementation for 32-bit platforms which support 64-bit CAS (to write
the slot ptr & index (lap counter) atomically) and uses 64-bit head & tail
indexes (to avoid the quick wrap around you would have with 32-bit ring
indexes).

So
ring[index % ring_size].lap = index / ring_size;

An implementation could of course use bitwise AND instead of modulo and a bitwise
right shift instead of division. The base-2 logarithm of ring_size should also be
precalculated and stored in the ring buffer metadata.

-- Ola

> 
> This commit places the new producer and consumer structures in the same
> location in struct rte_ring as their 32-bit counterparts. Since the 32-bit
> versions are padded out to a cache line, there is space for the new
> structure without affecting the layout of struct rte_ring. Thus, the ABI is
> preserved.
> 
> Signed-off-by: Gage Eads <gage.eads@intel.com>
> ---
>  lib/librte_ring/rte_ring.h         |  23 +++++-
>  lib/librte_ring/rte_ring_c11_mem.h | 153
> +++++++++++++++++++++++++++++++++++++
>  lib/librte_ring/rte_ring_generic.h | 139 +++++++++++++++++++++++++++++++++
>  3 files changed, 312 insertions(+), 3 deletions(-)
> 
> diff --git a/lib/librte_ring/rte_ring.h b/lib/librte_ring/rte_ring.h
> index af5444a9f..00dfb5b85 100644
> --- a/lib/librte_ring/rte_ring.h
> +++ b/lib/librte_ring/rte_ring.h
> @@ -1,6 +1,6 @@
>  /* SPDX-License-Identifier: BSD-3-Clause
>   *
> - * Copyright (c) 2010-2017 Intel Corporation
> + * Copyright (c) 2010-2019 Intel Corporation
>   * Copyright (c) 2007-2009 Kip Macy kmacy@freebsd.org
>   * All rights reserved.
>   * Derived from FreeBSD's bufring.h
> @@ -70,6 +70,15 @@ struct rte_ring_headtail {
>  	uint32_t single;         /**< True if single prod/cons */
>  };
>  
> +/* 64-bit version of rte_ring_headtail, for use by rings that need to avoid
> + * head/tail wrap-around.
> + */
> +struct rte_ring_headtail_64 {
> +	volatile uint64_t head;  /**< Prod/consumer head. */
> +	volatile uint64_t tail;  /**< Prod/consumer tail. */
> +	uint32_t single;       /**< True if single prod/cons */
> +};
> +
>  /**
>   * An RTE ring structure.
>   *
> @@ -97,11 +106,19 @@ struct rte_ring {
>  	char pad0 __rte_cache_aligned; /**< empty cache line */
>  
>  	/** Ring producer status. */
> -	struct rte_ring_headtail prod __rte_cache_aligned;
> +	RTE_STD_C11
> +	union {
> +		struct rte_ring_headtail prod __rte_cache_aligned;
> +		struct rte_ring_headtail_64 prod_64 __rte_cache_aligned;
> +	};
>  	char pad1 __rte_cache_aligned; /**< empty cache line */
>  
>  	/** Ring consumer status. */
> -	struct rte_ring_headtail cons __rte_cache_aligned;
> +	RTE_STD_C11
> +	union {
> +		struct rte_ring_headtail cons __rte_cache_aligned;
> +		struct rte_ring_headtail_64 cons_64 __rte_cache_aligned;
> +	};
>  	char pad2 __rte_cache_aligned; /**< empty cache line */
>  };
>  
> diff --git a/lib/librte_ring/rte_ring_c11_mem.h
> b/lib/librte_ring/rte_ring_c11_mem.h
> index 0fb73a337..47acd4c7c 100644
> --- a/lib/librte_ring/rte_ring_c11_mem.h
> +++ b/lib/librte_ring/rte_ring_c11_mem.h
> @@ -178,4 +178,157 @@ __rte_ring_move_cons_head(struct rte_ring *r, int is_sc,
>  	return n;
>  }
>  
> +/**
> + * @internal This function updates the producer head for enqueue using
> + *	     64-bit head/tail values.
> + *
> + * @param r
> + *   A pointer to the ring structure
> + * @param is_sp
> + *   Indicates whether multi-producer path is needed or not
> + * @param n
> + *   The number of elements we will want to enqueue, i.e. how far should the
> + *   head be moved
> + * @param behavior
> + *   RTE_RING_QUEUE_FIXED:    Enqueue a fixed number of items from a ring
> + *   RTE_RING_QUEUE_VARIABLE: Enqueue as many items as possible from ring
> + * @param old_head
> + *   Returns head value as it was before the move, i.e. where enqueue starts
> + * @param new_head
> + *   Returns the current/new head value i.e. where enqueue finishes
> + * @param free_entries
> + *   Returns the amount of free space in the ring BEFORE head was moved
> + * @return
> + *   Actual number of objects enqueued.
> + *   If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only.
> + */
> +static __rte_always_inline unsigned int
> +__rte_ring_move_prod_head_64(struct rte_ring *r, unsigned int is_sp,
> +		unsigned int n, enum rte_ring_queue_behavior behavior,
> +		uint64_t *old_head, uint64_t *new_head,
> +		uint32_t *free_entries)
> +{
> +	const uint32_t capacity = r->capacity;
> +	uint64_t cons_tail;
> +	unsigned int max = n;
> +	int success;
> +
> +	*old_head = __atomic_load_n(&r->prod_64.head, __ATOMIC_RELAXED);
> +	do {
> +		/* Reset n to the initial burst count */
> +		n = max;
> +
> +		/* Ensure the head is read before tail */
> +		__atomic_thread_fence(__ATOMIC_ACQUIRE);
> +
> +		/* load-acquire synchronize with store-release of ht->tail
> +		 * in update_tail.
> +		 */
> +		cons_tail = __atomic_load_n(&r->cons_64.tail,
> +					__ATOMIC_ACQUIRE);
> +
> +		/* The subtraction is done between two unsigned 32bits value
> +		 * (the result is always modulo 32 bits even if we have
> +		 * *old_head > cons_tail). So 'free_entries' is always
> between 0
> +		 * and capacity (which is < size).
> +		 */
> +		*free_entries = (capacity + cons_tail - *old_head);
> +
> +		/* check that we have enough room in ring */
> +		if (unlikely(n > *free_entries))
> +			n = (behavior == RTE_RING_QUEUE_FIXED) ?
> +					0 : *free_entries;
> +
> +		if (n == 0)
> +			return 0;
> +
> +		*new_head = *old_head + n;
> +		if (is_sp)
> +			r->prod_64.head = *new_head, success = 1;
> +		else
> +			/* on failure, *old_head is updated */
> +			success = __atomic_compare_exchange_n(&r-
> >prod_64.head,
> +					old_head, *new_head,
> +					0, __ATOMIC_RELAXED,
> +					__ATOMIC_RELAXED);
> +	} while (unlikely(success == 0));
> +	return n;
> +}
> +
> +/**
> + * @internal This function updates the consumer head for dequeue using
> + *	     64-bit head/tail values.
> + *
> + * @param r
> + *   A pointer to the ring structure
> + * @param is_sc
> + *   Indicates whether multi-consumer path is needed or not
> + * @param n
> + *   The number of elements we will want to enqueue, i.e. how far should the
> + *   head be moved
> + * @param behavior
> + *   RTE_RING_QUEUE_FIXED:    Dequeue a fixed number of items from a ring
> + *   RTE_RING_QUEUE_VARIABLE: Dequeue as many items as possible from ring
> + * @param old_head
> + *   Returns head value as it was before the move, i.e. where dequeue starts
> + * @param new_head
> + *   Returns the current/new head value i.e. where dequeue finishes
> + * @param entries
> + *   Returns the number of entries in the ring BEFORE head was moved
> + * @return
> + *   - Actual number of objects dequeued.
> + *     If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only.
> + */
> +static __rte_always_inline unsigned int
> +__rte_ring_move_cons_head_64(struct rte_ring *r, unsigned int is_sc,
> +		unsigned int n, enum rte_ring_queue_behavior behavior,
> +		uint64_t *old_head, uint64_t *new_head,
> +		uint32_t *entries)
> +{
> +	unsigned int max = n;
> +	uint64_t prod_tail;
> +	int success;
> +
> +	/* move cons.head atomically */
> +	*old_head = __atomic_load_n(&r->cons_64.head, __ATOMIC_RELAXED);
> +	do {
> +		/* Restore n as it may change every loop */
> +		n = max;
> +
> +		/* Ensure the head is read before tail */
> +		__atomic_thread_fence(__ATOMIC_ACQUIRE);
> +
> +		/* this load-acquire synchronize with store-release of ht-
> >tail
> +		 * in update_tail.
> +		 */
> +		prod_tail = __atomic_load_n(&r->prod_64.tail,
> +					__ATOMIC_ACQUIRE);
> +
> +		/* The subtraction is done between two unsigned 32bits value
> +		 * (the result is always modulo 32 bits even if we have
> +		 * cons_head > prod_tail). So 'entries' is always between 0
> +		 * and size(ring)-1.
> +		 */
> +		*entries = (prod_tail - *old_head);
> +
> +		/* Set the actual entries for dequeue */
> +		if (n > *entries)
> +			n = (behavior == RTE_RING_QUEUE_FIXED) ? 0 :
> *entries;
> +
> +		if (unlikely(n == 0))
> +			return 0;
> +
> +		*new_head = *old_head + n;
> +		if (is_sc)
> +			r->cons_64.head = *new_head, success = 1;
> +		else
> +			/* on failure, *old_head will be updated */
> +			success = __atomic_compare_exchange_n(&r-
> >cons_64.head,
> +							old_head, *new_head,
> +							0, __ATOMIC_RELAXED,
> +							__ATOMIC_RELAXED);
> +	} while (unlikely(success == 0));
> +	return n;
> +}
> +
>  #endif /* _RTE_RING_C11_MEM_H_ */
> diff --git a/lib/librte_ring/rte_ring_generic.h
> b/lib/librte_ring/rte_ring_generic.h
> index ea7dbe5b9..2158e092a 100644
> --- a/lib/librte_ring/rte_ring_generic.h
> +++ b/lib/librte_ring/rte_ring_generic.h
> @@ -167,4 +167,143 @@ __rte_ring_move_cons_head(struct rte_ring *r, unsigned
> int is_sc,
>  	return n;
>  }
>  
> +/**
> + * @internal This function updates the producer head for enqueue using
> + *	     64-bit head/tail values.
> + *
> + * @param r
> + *   A pointer to the ring structure
> + * @param is_sp
> + *   Indicates whether multi-producer path is needed or not
> + * @param n
> + *   The number of elements we will want to enqueue, i.e. how far should the
> + *   head be moved
> + * @param behavior
> + *   RTE_RING_QUEUE_FIXED:    Enqueue a fixed number of items from a ring
> + *   RTE_RING_QUEUE_VARIABLE: Enqueue as many items as possible from ring
> + * @param old_head
> + *   Returns head value as it was before the move, i.e. where enqueue starts
> + * @param new_head
> + *   Returns the current/new head value i.e. where enqueue finishes
> + * @param free_entries
> + *   Returns the amount of free space in the ring BEFORE head was moved
> + * @return
> + *   Actual number of objects enqueued.
> + *   If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only.
> + */
> +static __rte_always_inline unsigned int
> +__rte_ring_move_prod_head_64(struct rte_ring *r, unsigned int is_sp,
> +		unsigned int n, enum rte_ring_queue_behavior behavior,
> +		uint64_t *old_head, uint64_t *new_head,
> +		uint32_t *free_entries)
> +{
> +	const uint32_t capacity = r->capacity;
> +	unsigned int max = n;
> +	int success;
> +
> +	do {
> +		/* Reset n to the initial burst count */
> +		n = max;
> +
> +		*old_head = r->prod_64.head;
> +
> +		/* add rmb barrier to avoid load/load reorder in weak
> +		 * memory model. It is noop on x86
> +		 */
> +		rte_smp_rmb();
> +
> +		/*
> +		 *  The subtraction is done between two unsigned 64bits value
> +		 * (the result is always modulo 64 bits even if we have
> +		 * *old_head > cons_tail). So 'free_entries' is always
> between 0
> +		 * and capacity (which is < size).
> +		 */
> +		*free_entries = (capacity + r->cons_64.tail - *old_head);
> +
> +		/* check that we have enough room in ring */
> +		if (unlikely(n > *free_entries))
> +			n = (behavior == RTE_RING_QUEUE_FIXED) ?
> +					0 : *free_entries;
> +
> +		if (n == 0)
> +			return 0;
> +
> +		*new_head = *old_head + n;
> +		if (is_sp)
> +			r->prod_64.head = *new_head, success = 1;
> +		else
> +			success = rte_atomic64_cmpset(&r->prod_64.head,
> +					*old_head, *new_head);
> +	} while (unlikely(success == 0));
> +	return n;
> +}
> +
> +/**
> + * @internal This function updates the consumer head for dequeue using
> + *	     64-bit head/tail values.
> + *
> + * @param r
> + *   A pointer to the ring structure
> + * @param is_sc
> + *   Indicates whether multi-consumer path is needed or not
> + * @param n
> + *   The number of elements we will want to enqueue, i.e. how far should the
> + *   head be moved
> + * @param behavior
> + *   RTE_RING_QUEUE_FIXED:    Dequeue a fixed number of items from a ring
> + *   RTE_RING_QUEUE_VARIABLE: Dequeue as many items as possible from ring
> + * @param old_head
> + *   Returns head value as it was before the move, i.e. where dequeue starts
> + * @param new_head
> + *   Returns the current/new head value i.e. where dequeue finishes
> + * @param entries
> + *   Returns the number of entries in the ring BEFORE head was moved
> + * @return
> + *   - Actual number of objects dequeued.
> + *     If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only.
> + */
> +static __rte_always_inline unsigned int
> +__rte_ring_move_cons_head_64(struct rte_ring *r, unsigned int is_sc,
> +		unsigned int n, enum rte_ring_queue_behavior behavior,
> +		uint64_t *old_head, uint64_t *new_head,
> +		uint32_t *entries)
> +{
> +	unsigned int max = n;
> +	int success;
> +
> +	do {
> +		/* Restore n as it may change every loop */
> +		n = max;
> +
> +		*old_head = r->cons_64.head;
> +
> +		/* add rmb barrier to avoid load/load reorder in weak
> +		 * memory model. It is noop on x86
> +		 */
> +		rte_smp_rmb();
> +
> +		/* The subtraction is done between two unsigned 64bits value
> +		 * (the result is always modulo 64 bits even if we have
> +		 * cons_head > prod_tail). So 'entries' is always between 0
> +		 * and size(ring)-1.
> +		 */
> +		*entries = (r->prod_64.tail - *old_head);
> +
> +		/* Set the actual entries for dequeue */
> +		if (n > *entries)
> +			n = (behavior == RTE_RING_QUEUE_FIXED) ? 0 :
> *entries;
> +
> +		if (unlikely(n == 0))
> +			return 0;
> +
> +		*new_head = *old_head + n;
> +		if (is_sc)
> +			r->cons_64.head = *new_head, success = 1;
> +		else
> +			success = rte_atomic64_cmpset(&r->cons_64.head,
> +					*old_head, *new_head);
> +	} while (unlikely(success == 0));
> +	return n;
> +}
> +
>  #endif /* _RTE_RING_GENERIC_H_ */
Eads, Gage Jan. 30, 2019, 4:26 a.m. | #2
> -----Original Message-----
> From: Ola Liljedahl [mailto:Ola.Liljedahl@arm.com]
> Sent: Tuesday, January 29, 2019 6:57 AM
> To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> Cc: jerinj@marvell.com; mczekaj@marvell.com; nd <nd@arm.com>;
> Richardson, Bruce <bruce.richardson@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; stephen@networkplumber.org;
> olivier.matz@6wind.com; arybchenko@solarflare.com
> Subject: Re: [PATCH v4 1/5] ring: add 64-bit headtail structure
> 
> On Mon, 2019-01-28 at 12:14 -0600, Gage Eads wrote:
> > 64-bit head and tail index widths greatly increases the time it takes
> > for them to wrap-around (with current CPU speeds, it won't happen
> > within the author's lifetime). This is fundamental to avoiding the ABA
> > problem -- in which a thread mistakes reading the same tail index in
> > two accesses to mean that the ring was not modified in the intervening
> > time -- in the upcoming non-blocking ring implementation. Using a
> > 64-bit index makes the possibility of this occurring effectively zero.
> Just an observation.
> The following invariant holds (using ring_size instead of mask):
> ∀ index: ring[index % ring_size].index % ring_size == index % ring_size i.e. the N
> (N=log2 ring size) lsb of ring[].index will always be the same (for a specific slot)
> so serve no purpose.
> 
> This means we don't have to store the whole index in each slot, it is enough to
> store "index / ring_size" (which I call the lap counter). This could be useful for an
> implementation for 32-bit platforms which support 64-bit CAS (to write the slot
> ptr & index (lap counter) atomically) and uses 64-bit head & tail indexes (to avoid
> the quick wrap around you would have with 32-bit ring indexes).
> 
> So
> ring[index % ring_size].lap = index / ring_size;
> 
> An implementation could of course use bitwise AND instead of modulo and a
> bitwise right shift instead of division. The base-2 logarithm of ring_size should
> also be precalculated and stored in the ring buffer metadata.
> 

That's a pretty interesting idea. The question is, with such a design, what should DPDK's minimum NB ring size be on 32-bit platforms?

If a ring entry is written on average every M cycles, then a lap occurs every M*N cycles and each counter repeats every M*N*2^32 cycles. If M=100 on a 2GHz system, then (approximating 2^32 as 4*10^9; the exact figure for N=1 is about 3.58 minutes) the counter repeats roughly every

N=1: 3.33 minutes
...
N=256: 14.22 hours
N=512: 28.44 hours
N=1024: 2.37 days
...
N=16384: 37.92 days

I think a minimum size of 1024 strikes a good balance between not too burdensome and sufficiently low odds of ABA occurring.

Thanks,
Gage

[snip]

Patch

diff --git a/lib/librte_ring/rte_ring.h b/lib/librte_ring/rte_ring.h
index af5444a9f..00dfb5b85 100644
--- a/lib/librte_ring/rte_ring.h
+++ b/lib/librte_ring/rte_ring.h
@@ -1,6 +1,6 @@ 
 /* SPDX-License-Identifier: BSD-3-Clause
  *
- * Copyright (c) 2010-2017 Intel Corporation
+ * Copyright (c) 2010-2019 Intel Corporation
  * Copyright (c) 2007-2009 Kip Macy kmacy@freebsd.org
  * All rights reserved.
  * Derived from FreeBSD's bufring.h
@@ -70,6 +70,15 @@  struct rte_ring_headtail {
 	uint32_t single;         /**< True if single prod/cons */
 };
 
+/* 64-bit version of rte_ring_headtail, for rings whose head/tail indexes must
+ * be wide enough that wrap-around is not a practical concern.
+ */
+struct rte_ring_headtail_64 {
+	volatile uint64_t head;  /**< Producer/consumer head index. */
+	volatile uint64_t tail;  /**< Producer/consumer tail index. */
+	uint32_t single;       /**< True if single producer/consumer. */
+};
+
 /**
  * An RTE ring structure.
  *
@@ -97,11 +106,19 @@  struct rte_ring {
 	char pad0 __rte_cache_aligned; /**< empty cache line */
 
 	/** Ring producer status. */
-	struct rte_ring_headtail prod __rte_cache_aligned;
+	RTE_STD_C11
+	union {
+		struct rte_ring_headtail prod __rte_cache_aligned;
+		struct rte_ring_headtail_64 prod_64 __rte_cache_aligned;
+	};
 	char pad1 __rte_cache_aligned; /**< empty cache line */
 
 	/** Ring consumer status. */
-	struct rte_ring_headtail cons __rte_cache_aligned;
+	RTE_STD_C11
+	union {
+		struct rte_ring_headtail cons __rte_cache_aligned;
+		struct rte_ring_headtail_64 cons_64 __rte_cache_aligned;
+	};
 	char pad2 __rte_cache_aligned; /**< empty cache line */
 };
 
diff --git a/lib/librte_ring/rte_ring_c11_mem.h b/lib/librte_ring/rte_ring_c11_mem.h
index 0fb73a337..47acd4c7c 100644
--- a/lib/librte_ring/rte_ring_c11_mem.h
+++ b/lib/librte_ring/rte_ring_c11_mem.h
@@ -178,4 +178,157 @@  __rte_ring_move_cons_head(struct rte_ring *r, int is_sc,
 	return n;
 }
 
+/**
+ * @internal This function updates the producer head for enqueue using
+ *	     64-bit head/tail values (C11 atomics variant).
+ *
+ * @param r
+ *   A pointer to the ring structure
+ * @param is_sp
+ *   Non-zero: single-producer (plain store); zero: multi-producer (CAS loop)
+ * @param n
+ *   The number of elements we will want to enqueue, i.e. how far should the
+ *   head be moved
+ * @param behavior
+ *   RTE_RING_QUEUE_FIXED:    Enqueue a fixed number of items to the ring
+ *   RTE_RING_QUEUE_VARIABLE: Enqueue as many items as possible to the ring
+ * @param old_head
+ *   Returns head value as it was before the move, i.e. where enqueue starts
+ * @param new_head
+ *   Returns the current/new head value i.e. where enqueue finishes
+ * @param free_entries
+ *   Returns the amount of free space in the ring BEFORE head was moved
+ * @return
+ *   Number of entries the head was moved by; 0 if there is not enough room.
+ *   If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only.
+ */
+static __rte_always_inline unsigned int
+__rte_ring_move_prod_head_64(struct rte_ring *r, unsigned int is_sp,
+		unsigned int n, enum rte_ring_queue_behavior behavior,
+		uint64_t *old_head, uint64_t *new_head,
+		uint32_t *free_entries)
+{
+	const uint32_t capacity = r->capacity;
+	uint64_t cons_tail;
+	unsigned int max = n;
+	int success;
+
+	*old_head = __atomic_load_n(&r->prod_64.head, __ATOMIC_RELAXED);
+	do {
+		/* Reset n to the initial burst count */
+		n = max;
+
+		/* Ensure the head is read before tail */
+		__atomic_thread_fence(__ATOMIC_ACQUIRE);
+
+		/* load-acquire synchronizes with store-release of ht->tail
+		 * in update_tail.
+		 */
+		cons_tail = __atomic_load_n(&r->cons_64.tail,
+					__ATOMIC_ACQUIRE);
+
+		/* The subtraction is done between unsigned 64-bit values (the
+		 * result is modulo 2^64 even if *old_head > cons_tail) and is
+		 * then truncated to the 32-bit *free_entries. So 'free_entries'
+		 * is always between 0 and capacity (which is < size).
+		 */
+		*free_entries = (capacity + cons_tail - *old_head);
+
+		/* check that we have enough room in ring */
+		if (unlikely(n > *free_entries))
+			n = (behavior == RTE_RING_QUEUE_FIXED) ?
+					0 : *free_entries;
+
+		if (n == 0)
+			return 0;
+
+		*new_head = *old_head + n;
+		if (is_sp)
+			r->prod_64.head = *new_head, success = 1;
+		else
+			/* on failure, *old_head is updated with the current head */
+			success = __atomic_compare_exchange_n(&r->prod_64.head,
+					old_head, *new_head,
+					0, __ATOMIC_RELAXED,
+					__ATOMIC_RELAXED);
+	} while (unlikely(success == 0));
+	return n;
+}
+
+/**
+ * @internal This function updates the consumer head for dequeue using
+ *	     64-bit head/tail values (C11 atomics variant).
+ *
+ * @param r
+ *   A pointer to the ring structure
+ * @param is_sc
+ *   Non-zero: single-consumer (plain store); zero: multi-consumer (CAS loop)
+ * @param n
+ *   The number of elements we will want to dequeue, i.e. how far should the
+ *   head be moved
+ * @param behavior
+ *   RTE_RING_QUEUE_FIXED:    Dequeue a fixed number of items from a ring
+ *   RTE_RING_QUEUE_VARIABLE: Dequeue as many items as possible from ring
+ * @param old_head
+ *   Returns head value as it was before the move, i.e. where dequeue starts
+ * @param new_head
+ *   Returns the current/new head value i.e. where dequeue finishes
+ * @param entries
+ *   Returns the number of entries in the ring BEFORE head was moved
+ * @return
+ *   - Number of entries the head was moved by; 0 if too few are available.
+ *     If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only.
+ */
+static __rte_always_inline unsigned int
+__rte_ring_move_cons_head_64(struct rte_ring *r, unsigned int is_sc,
+		unsigned int n, enum rte_ring_queue_behavior behavior,
+		uint64_t *old_head, uint64_t *new_head,
+		uint32_t *entries)
+{
+	unsigned int max = n;
+	uint64_t prod_tail;
+	int success;
+
+	/* move cons.head atomically */
+	*old_head = __atomic_load_n(&r->cons_64.head, __ATOMIC_RELAXED);
+	do {
+		/* Restore n as it may change every loop */
+		n = max;
+
+		/* Ensure the head is read before tail */
+		__atomic_thread_fence(__ATOMIC_ACQUIRE);
+
+		/* this load-acquire synchronizes with store-release of ht->tail
+		 * in update_tail.
+		 */
+		prod_tail = __atomic_load_n(&r->prod_64.tail,
+					__ATOMIC_ACQUIRE);
+
+		/* The subtraction is done between unsigned 64-bit values (the
+		 * result is modulo 2^64 even if cons_head > prod_tail) and is
+		 * then truncated to the 32-bit *entries. So 'entries' is always
+		 * between 0 and size(ring)-1.
+		 */
+		*entries = (prod_tail - *old_head);
+
+		/* Set the actual entries for dequeue */
+		if (n > *entries)
+			n = (behavior == RTE_RING_QUEUE_FIXED) ? 0 : *entries;
+
+		if (unlikely(n == 0))
+			return 0;
+
+		*new_head = *old_head + n;
+		if (is_sc)
+			r->cons_64.head = *new_head, success = 1;
+		else
+			/* on failure, *old_head is updated with the current head */
+			success = __atomic_compare_exchange_n(&r->cons_64.head,
+							old_head, *new_head,
+							0, __ATOMIC_RELAXED,
+							__ATOMIC_RELAXED);
+	} while (unlikely(success == 0));
+	return n;
+}
+
 #endif /* _RTE_RING_C11_MEM_H_ */
diff --git a/lib/librte_ring/rte_ring_generic.h b/lib/librte_ring/rte_ring_generic.h
index ea7dbe5b9..2158e092a 100644
--- a/lib/librte_ring/rte_ring_generic.h
+++ b/lib/librte_ring/rte_ring_generic.h
@@ -167,4 +167,143 @@  __rte_ring_move_cons_head(struct rte_ring *r, unsigned int is_sc,
 	return n;
 }
 
+/**
+ * @internal This function updates the producer head for enqueue using
+ *	     64-bit head/tail values (generic barrier-based variant).
+ *
+ * @param r
+ *   A pointer to the ring structure
+ * @param is_sp
+ *   Non-zero: single-producer (plain store); zero: multi-producer (CAS loop)
+ * @param n
+ *   The number of elements we will want to enqueue, i.e. how far should the
+ *   head be moved
+ * @param behavior
+ *   RTE_RING_QUEUE_FIXED:    Enqueue a fixed number of items to the ring
+ *   RTE_RING_QUEUE_VARIABLE: Enqueue as many items as possible to the ring
+ * @param old_head
+ *   Returns head value as it was before the move, i.e. where enqueue starts
+ * @param new_head
+ *   Returns the current/new head value i.e. where enqueue finishes
+ * @param free_entries
+ *   Returns the amount of free space in the ring BEFORE head was moved
+ * @return
+ *   Number of entries the head was moved by; 0 if there is not enough room.
+ *   If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only.
+ */
+static __rte_always_inline unsigned int
+__rte_ring_move_prod_head_64(struct rte_ring *r, unsigned int is_sp,
+		unsigned int n, enum rte_ring_queue_behavior behavior,
+		uint64_t *old_head, uint64_t *new_head,
+		uint32_t *free_entries)
+{
+	const uint32_t capacity = r->capacity;
+	unsigned int max = n;
+	int success;
+
+	do {
+		/* Reset n to the initial burst count */
+		n = max;
+
+		*old_head = r->prod_64.head;
+
+		/* add rmb barrier to avoid load/load reorder in weak
+		 * memory model. It is noop on x86
+		 */
+		rte_smp_rmb();
+
+		/* The subtraction is done between unsigned 64-bit values (the
+		 * result is modulo 2^64 even if *old_head > cons_tail) and is
+		 * then truncated to the 32-bit *free_entries.
+		 * So 'free_entries' is always between 0 and capacity (which is
+		 * < size).
+		 */
+		*free_entries = (capacity + r->cons_64.tail - *old_head);
+
+		/* check that we have enough room in ring */
+		if (unlikely(n > *free_entries))
+			n = (behavior == RTE_RING_QUEUE_FIXED) ?
+					0 : *free_entries;
+
+		if (n == 0)
+			return 0;
+
+		*new_head = *old_head + n;
+		if (is_sp)
+			r->prod_64.head = *new_head, success = 1;
+		else
+			success = rte_atomic64_cmpset(&r->prod_64.head,
+					*old_head, *new_head);
+	} while (unlikely(success == 0));
+	return n;
+}
+
+/**
+ * @internal This function updates the consumer head for dequeue using
+ *	     64-bit head/tail values (generic barrier-based variant).
+ *
+ * @param r
+ *   A pointer to the ring structure
+ * @param is_sc
+ *   Non-zero: single-consumer (plain store); zero: multi-consumer (CAS loop)
+ * @param n
+ *   The number of elements we will want to dequeue, i.e. how far should the
+ *   head be moved
+ * @param behavior
+ *   RTE_RING_QUEUE_FIXED:    Dequeue a fixed number of items from a ring
+ *   RTE_RING_QUEUE_VARIABLE: Dequeue as many items as possible from ring
+ * @param old_head
+ *   Returns head value as it was before the move, i.e. where dequeue starts
+ * @param new_head
+ *   Returns the current/new head value i.e. where dequeue finishes
+ * @param entries
+ *   Returns the number of entries in the ring BEFORE head was moved
+ * @return
+ *   - Number of entries the head was moved by; 0 if too few are available.
+ *     If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only.
+ */
+static __rte_always_inline unsigned int
+__rte_ring_move_cons_head_64(struct rte_ring *r, unsigned int is_sc,
+		unsigned int n, enum rte_ring_queue_behavior behavior,
+		uint64_t *old_head, uint64_t *new_head,
+		uint32_t *entries)
+{
+	unsigned int max = n;
+	int success;
+
+	do {
+		/* Restore n as it may change every loop */
+		n = max;
+
+		*old_head = r->cons_64.head;
+
+		/* add rmb barrier to avoid load/load reorder in weak
+		 * memory model. It is noop on x86
+		 */
+		rte_smp_rmb();
+
+		/* The subtraction is done between unsigned 64-bit values (the
+		 * result is modulo 2^64 even if cons_head > prod_tail) and is
+		 * then truncated to the 32-bit *entries. So 'entries' is always
+		 * between 0 and size(ring)-1.
+		 */
+		*entries = (r->prod_64.tail - *old_head);
+
+		/* Set the actual entries for dequeue */
+		if (n > *entries)
+			n = (behavior == RTE_RING_QUEUE_FIXED) ? 0 : *entries;
+
+		if (unlikely(n == 0))
+			return 0;
+
+		*new_head = *old_head + n;
+		if (is_sc)
+			r->cons_64.head = *new_head, success = 1;
+		else
+			success = rte_atomic64_cmpset(&r->cons_64.head,
+					*old_head, *new_head);
+	} while (unlikely(success == 0));
+	return n;
+}
+
 #endif /* _RTE_RING_GENERIC_H_ */