[v4,1/3] random: add rte_drand() function
Checks
Commit Message
The PIE code and other applications can benefit from having a
fast way to get a random floating point value. This new function
is the equivalent of drand48() in the POSIX C library.
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
app/test/test_rand_perf.c | 7 +++++
doc/guides/rel_notes/release_22_07.rst | 5 ++++
lib/eal/common/rte_random.c | 41 ++++++++++++++++++++++++++
lib/eal/include/rte_random.h | 18 +++++++++++
lib/eal/meson.build | 3 ++
lib/eal/version.map | 1 +
6 files changed, 75 insertions(+)
Comments
Stephen Hemminger <stephen@networkplumber.org> writes:
> The PIE code and other applications can benefit from having a
> fast way to get a random floating point value. This new function
> is equivalent to drand() in the standard library.
>
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> ---
> app/test/test_rand_perf.c | 7 +++++
> doc/guides/rel_notes/release_22_07.rst | 5 ++++
> lib/eal/common/rte_random.c | 41 ++++++++++++++++++++++++++
> lib/eal/include/rte_random.h | 18 +++++++++++
> lib/eal/meson.build | 3 ++
> lib/eal/version.map | 1 +
> 6 files changed, 75 insertions(+)
>
Acked-by: Ray Kinsella <mdr@ashoe.eu>
On 2022-05-25 22:31, Stephen Hemminger wrote:
> The PIE code and other applications can benefit from having a
> fast way to get a random floating point value. This new function
> is equivalent to drand() in the standard library.
>
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> ---
> app/test/test_rand_perf.c | 7 +++++
> doc/guides/rel_notes/release_22_07.rst | 5 ++++
> lib/eal/common/rte_random.c | 41 ++++++++++++++++++++++++++
> lib/eal/include/rte_random.h | 18 +++++++++++
> lib/eal/meson.build | 3 ++
> lib/eal/version.map | 1 +
> 6 files changed, 75 insertions(+)
>
> diff --git a/app/test/test_rand_perf.c b/app/test/test_rand_perf.c
> index fe797ebfa1ca..26fb1d9a586e 100644
> --- a/app/test/test_rand_perf.c
> +++ b/app/test/test_rand_perf.c
> @@ -20,6 +20,7 @@ static volatile uint64_t vsum;
>
> enum rand_type {
> rand_type_64,
> + rand_type_float,
> rand_type_bounded_best_case,
> rand_type_bounded_worst_case
> };
> @@ -30,6 +31,8 @@ rand_type_desc(enum rand_type rand_type)
> switch (rand_type) {
> case rand_type_64:
> return "Full 64-bit [rte_rand()]";
> + case rand_type_float:
> + return "Floating point [rte_drand()]";
> case rand_type_bounded_best_case:
> return "Bounded average best-case [rte_rand_max()]";
> case rand_type_bounded_worst_case:
> @@ -55,6 +58,9 @@ test_rand_perf_type(enum rand_type rand_type)
> case rand_type_64:
> sum += rte_rand();
> break;
> + case rand_type_float:
> + sum += 1000. * rte_drand();
Including this floating point multiplication will lead to an
overestimation of rte_drand() latency.
You could refactor this function to be a macro, and pass the return type
as a parameter to this macro. I did just that, and on both an AMD
5900X and a Cortex-A72 the multiplication didn't add more than ~5%, so I
don't think the refactoring is necessary.
> + break;
> case rand_type_bounded_best_case:
> sum += rte_rand_max(BEST_CASE_BOUND);
> break;
> @@ -83,6 +89,7 @@ test_rand_perf(void)
> printf("Pseudo-random number generation latencies:\n");
>
> test_rand_perf_type(rand_type_64);
> + test_rand_perf_type(rand_type_float);
> test_rand_perf_type(rand_type_bounded_best_case);
> test_rand_perf_type(rand_type_bounded_worst_case);
>
> diff --git a/doc/guides/rel_notes/release_22_07.rst b/doc/guides/rel_notes/release_22_07.rst
> index e49cacecefd4..b131ea577226 100644
> --- a/doc/guides/rel_notes/release_22_07.rst
> +++ b/doc/guides/rel_notes/release_22_07.rst
> @@ -104,6 +104,11 @@ New Features
> * ``RTE_EVENT_QUEUE_ATTR_WEIGHT``
> * ``RTE_EVENT_QUEUE_ATTR_AFFINITY``
>
> +* ** Added function get random floating point number.**
> +
> + Added the function ``rte_drand()`` to provide a pseudo-random
> + floating point number.
> +
>
> Removed Items
> -------------
> diff --git a/lib/eal/common/rte_random.c b/lib/eal/common/rte_random.c
> index 4535cc980cec..3dc3484ee655 100644
> --- a/lib/eal/common/rte_random.c
> +++ b/lib/eal/common/rte_random.c
> @@ -6,6 +6,9 @@
> #include <x86intrin.h>
> #endif
> #include <unistd.h>
> +#ifdef RTE_LIBEAL_USE_IEEE754
> +#include <ieee754.h>
> +#endif
>
> #include <rte_branch_prediction.h>
> #include <rte_cycles.h>
> @@ -173,6 +176,44 @@ rte_rand_max(uint64_t upper_bound)
> return res;
> }
>
> +double
> +rte_drand(void)
> +{
> + struct rte_rand_state *state = __rte_rand_get_state();
> + uint64_t rand64 = __rte_rand_lfsr258(state);
> +#ifdef RTE_LIBEAL_USE_IEEE754
> + union ieee754_double u = {
> + .ieee = {
> + .negative = 0,
> + .exponent = IEEE754_DOUBLE_BIAS,
> + },
> + };
> +
> + /* Take 64 bit random value and put it into the mantissa
> + * This uses direct access to IEEE format to avoid doing
> + * any direct floating point math here.
> + */
> + u.ieee.mantissa0 = rand64 >> 32;
> + u.ieee.mantissa1 = rand64;
> +
> + return u.d - 1.0;
> +#else
> + /* Slower method requiring floating point divide
> + *
Do you know how much slower? I ran rand_perf_test on two of my systems.
                       AMD 5900X   Pi4 (ARM Cortex-A72)
IEEE754 version            12             1.19
Non-IEEE754 version        11             1.16
Naive version*             24             1.16

* (double)rte_rand() / (double)UINT64_MAX

Numbers are TSC cycles/op.
Surprisingly, it seems like the IEEE754 version is slower on both of
these machines.
Do you have a machine (or a different use case) where the supposedly
more optimized version actually runs faster?
> + * The double mantissa only has 53 bits, so we uniformly mask off the
> + * high 11 bits and then floating-point divide by 2^53 to achieve a
> + * result in [0, 1).
> + *
> + * We are not allowed to emit 1.0, so denom must be one greater than
> + * the possible range of the preceeding step.
> + */
> + static const uint64_t denom = (uint64_t)1 << 53;
Remove "static const". Surely, this can't make a difference (at least
not in a positive direction).
> +
> + rand64 &= denom - 1;
> + return (double)rand64 / denom;
> +#endif
> +}
> +
> static uint64_t
> __rte_random_initial_seed(void)
> {
> diff --git a/lib/eal/include/rte_random.h b/lib/eal/include/rte_random.h
> index 29f5f1325a30..f6541c2b0f08 100644
> --- a/lib/eal/include/rte_random.h
> +++ b/lib/eal/include/rte_random.h
> @@ -65,6 +65,24 @@ rte_rand(void);
> uint64_t
> rte_rand_max(uint64_t upper_bound);
>
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Generates a pseudo-random floating point number.
> + *
> + * This function returns a nonnegative double-precision floating random
> + * number uniformly distributed over the interval [0.0, 1.0).
> + *
> + * The generator is not cryptographically secure.
> + * If called from lcore threads, this function is thread-safe.
> + *
> + * @return
> + * A pseudo-random value between 0 and 1.0.
> + */
> +__rte_experimental
> +double rte_drand(void);
> +
> #ifdef __cplusplus
> }
> #endif
> diff --git a/lib/eal/meson.build b/lib/eal/meson.build
> index 056beb946119..e50524901c98 100644
> --- a/lib/eal/meson.build
> +++ b/lib/eal/meson.build
> @@ -32,3 +32,6 @@ endif
> if cc.has_function('getentropy', prefix : '#include <unistd.h>')
> cflags += '-DRTE_LIBEAL_USE_GETENTROPY'
> endif
> +if cc.has_header_symbol('ieee754.h', 'union ieee754_double')
> + cflags += '-DRTE_LIBEAL_USE_IEEE754'
> +endif
> diff --git a/lib/eal/version.map b/lib/eal/version.map
> index d49e30bd042f..cfbade9a33e9 100644
> --- a/lib/eal/version.map
> +++ b/lib/eal/version.map
> @@ -422,6 +422,7 @@ EXPERIMENTAL {
> rte_intr_type_set;
>
> # added in 22.07
> + rte_drand;
> rte_thread_get_affinity_by_id;
> rte_thread_self;
> rte_thread_set_affinity_by_id;
On Thu, 26 May 2022 15:20:29 +0200
Mattias Rönnblom <hofors@lysator.liu.se> wrote:
> > @@ -55,6 +58,9 @@ test_rand_perf_type(enum rand_type rand_type)
> > case rand_type_64:
> > sum += rte_rand();
> > break;
> > + case rand_type_float:
> > + sum += 1000. * rte_drand();
>
> Including this floating point multiplication will lead to an
> overestimation of rte_drand() latency.
>
> You could refactor this function to be a macro, and pass the return type
> to as a parameter to this macro. I did just that, and on both an AMD
> 5900X and a Cortex-A72 it didn't add more than ~5%, so I don't think
> it's necessary.
The test is not doing anything useful with the result.
It is just a way to exercise the code.
Macros are evil, have little or no typechecking and should be avoided.
On Thu, 26 May 2022 15:20:29 +0200
Mattias Rönnblom <hofors@lysator.liu.se> wrote:
> > +#else
> > + /* Slower method requiring floating point divide
> > + *
>
> Do you know how much slower? I ran rand_perf_test on two of my systems.
>
> AMD 5900X Pi4 (ARM Cortex-A72)
> IEEE754 version 12 1.19
> Non-IEEE754 version 11 1.16
> Naive version* 24 1.16
>
> * (double)rte_rand() / (double)UINT64_MAX
>
> Numbers are TSC cycles/op.
>
> Surprisingly, it seems like the IEEE754 version is slower on both of
> these machines.
>
> Do you have a machine (or a different use case) where the supposedly
> more optimized version actually runs faster?
The direct method is based on the concept used by glibc and others,
and the divide version (including its spelling error) is from FreeBSD.
Be careful with micro-benchmarks. A better one would call
rte_drand() and compare the result with some value to check whether it is in range.
On Thu, 26 May 2022 15:20:29 +0200
Mattias Rönnblom <hofors@lysator.liu.se> wrote:
> On 2022-05-25 22:31, Stephen Hemminger wrote:
> > The PIE code and other applications can benefit from having a
> > fast way to get a random floating point value. This new function
> > is equivalent to drand() in the standard library.
> >
> > Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> > ---
> > app/test/test_rand_perf.c | 7 +++++
> > doc/guides/rel_notes/release_22_07.rst | 5 ++++
> > lib/eal/common/rte_random.c | 41 ++++++++++++++++++++++++++
> > lib/eal/include/rte_random.h | 18 +++++++++++
> > lib/eal/meson.build | 3 ++
> > lib/eal/version.map | 1 +
> > 6 files changed, 75 insertions(+)
> >
> > diff --git a/app/test/test_rand_perf.c b/app/test/test_rand_perf.c
> > index fe797ebfa1ca..26fb1d9a586e 100644
> > --- a/app/test/test_rand_perf.c
> > +++ b/app/test/test_rand_perf.c
> > @@ -20,6 +20,7 @@ static volatile uint64_t vsum;
> >
> > enum rand_type {
> > rand_type_64,
> > + rand_type_float,
> > rand_type_bounded_best_case,
> > rand_type_bounded_worst_case
> > };
> > @@ -30,6 +31,8 @@ rand_type_desc(enum rand_type rand_type)
> > switch (rand_type) {
> > case rand_type_64:
> > return "Full 64-bit [rte_rand()]";
> > + case rand_type_float:
> > + return "Floating point [rte_drand()]";
> > case rand_type_bounded_best_case:
> > return "Bounded average best-case [rte_rand_max()]";
> > case rand_type_bounded_worst_case:
> > @@ -55,6 +58,9 @@ test_rand_perf_type(enum rand_type rand_type)
> > case rand_type_64:
> > sum += rte_rand();
> > break;
> > + case rand_type_float:
> > + sum += 1000. * rte_drand();
>
> Including this floating point multiplication will lead to an
> overestimation of rte_drand() latency.
>
> You could refactor this function to be a macro, and pass the return type
> to as a parameter to this macro. I did just that, and on both an AMD
> 5900X and a Cortex-A72 it didn't add more than ~5%, so I don't think
> it's necessary.
>
> > + break;
> > case rand_type_bounded_best_case:
> > sum += rte_rand_max(BEST_CASE_BOUND);
> > break;
> > @@ -83,6 +89,7 @@ test_rand_perf(void)
> > printf("Pseudo-random number generation latencies:\n");
> >
> > test_rand_perf_type(rand_type_64);
> > + test_rand_perf_type(rand_type_float);
> > test_rand_perf_type(rand_type_bounded_best_case);
> > test_rand_perf_type(rand_type_bounded_worst_case);
> >
> > diff --git a/doc/guides/rel_notes/release_22_07.rst b/doc/guides/rel_notes/release_22_07.rst
> > index e49cacecefd4..b131ea577226 100644
> > --- a/doc/guides/rel_notes/release_22_07.rst
> > +++ b/doc/guides/rel_notes/release_22_07.rst
> > @@ -104,6 +104,11 @@ New Features
> > * ``RTE_EVENT_QUEUE_ATTR_WEIGHT``
> > * ``RTE_EVENT_QUEUE_ATTR_AFFINITY``
> >
> > +* ** Added function get random floating point number.**
> > +
> > + Added the function ``rte_drand()`` to provide a pseudo-random
> > + floating point number.
> > +
> >
> > Removed Items
> > -------------
> > diff --git a/lib/eal/common/rte_random.c b/lib/eal/common/rte_random.c
> > index 4535cc980cec..3dc3484ee655 100644
> > --- a/lib/eal/common/rte_random.c
> > +++ b/lib/eal/common/rte_random.c
> > @@ -6,6 +6,9 @@
> > #include <x86intrin.h>
> > #endif
> > #include <unistd.h>
> > +#ifdef RTE_LIBEAL_USE_IEEE754
> > +#include <ieee754.h>
> > +#endif
> >
> > #include <rte_branch_prediction.h>
> > #include <rte_cycles.h>
> > @@ -173,6 +176,44 @@ rte_rand_max(uint64_t upper_bound)
> > return res;
> > }
> >
> > +double
> > +rte_drand(void)
> > +{
> > + struct rte_rand_state *state = __rte_rand_get_state();
> > + uint64_t rand64 = __rte_rand_lfsr258(state);
> > +#ifdef RTE_LIBEAL_USE_IEEE754
> > + union ieee754_double u = {
> > + .ieee = {
> > + .negative = 0,
> > + .exponent = IEEE754_DOUBLE_BIAS,
> > + },
> > + };
> > +
> > + /* Take 64 bit random value and put it into the mantissa
> > + * This uses direct access to IEEE format to avoid doing
> > + * any direct floating point math here.
> > + */
> > + u.ieee.mantissa0 = rand64 >> 32;
> > + u.ieee.mantissa1 = rand64;
> > +
> > + return u.d - 1.0;
> > +#else
> > + /* Slower method requiring floating point divide
> > + *
>
> Do you know how much slower? I ran rand_perf_test on two of my systems.
>
> AMD 5900X Pi4 (ARM Cortex-A72)
> IEEE754 version 12 1.19
> Non-IEEE754 version 11 1.16
> Naive version* 24 1.16
>
> * (double)rte_rand() / (double)UINT64_MAX
>
> Numbers are TSC cycles/op.
On an AMD Ryzen 7, both versions take 9 cycles/op with rand_perf_autotest,
so it is a toss-up.
The 754 version compiles to the following (ARM assembly):
ubfx r1, r1, #0, #20
orr r3, r1, #1069547520 << mantissa0
mov r2, r0
orr r3, r3, #3145728
vmov.f64 d0, #1.0e+0
vmov d16, r2, r3
vsub.f64 d0, d16, d0 << return u.d - 1.0
Note: the compiler performs a smart optimization on the divide version.
Since the denominator is a fixed value, it knows it can use a multiply instead:
vmov d16, r0, r1
vmul.f64 d0, d16, d0
@@ -20,6 +20,7 @@ static volatile uint64_t vsum;
enum rand_type {
rand_type_64,
+ rand_type_float,
rand_type_bounded_best_case,
rand_type_bounded_worst_case
};
@@ -30,6 +31,8 @@ rand_type_desc(enum rand_type rand_type)
switch (rand_type) {
case rand_type_64:
return "Full 64-bit [rte_rand()]";
+ case rand_type_float:
+ return "Floating point [rte_drand()]";
case rand_type_bounded_best_case:
return "Bounded average best-case [rte_rand_max()]";
case rand_type_bounded_worst_case:
@@ -55,6 +58,9 @@ test_rand_perf_type(enum rand_type rand_type)
case rand_type_64:
sum += rte_rand();
break;
+ case rand_type_float:
+ sum += 1000. * rte_drand();
+ break;
case rand_type_bounded_best_case:
sum += rte_rand_max(BEST_CASE_BOUND);
break;
@@ -83,6 +89,7 @@ test_rand_perf(void)
printf("Pseudo-random number generation latencies:\n");
test_rand_perf_type(rand_type_64);
+ test_rand_perf_type(rand_type_float);
test_rand_perf_type(rand_type_bounded_best_case);
test_rand_perf_type(rand_type_bounded_worst_case);
@@ -104,6 +104,11 @@ New Features
* ``RTE_EVENT_QUEUE_ATTR_WEIGHT``
* ``RTE_EVENT_QUEUE_ATTR_AFFINITY``
+* **Added function to get a random floating point number.**
+
+ Added the function ``rte_drand()`` to provide a pseudo-random
+ floating point number.
+
Removed Items
-------------
@@ -6,6 +6,9 @@
#include <x86intrin.h>
#endif
#include <unistd.h>
+#ifdef RTE_LIBEAL_USE_IEEE754
+#include <ieee754.h>
+#endif
#include <rte_branch_prediction.h>
#include <rte_cycles.h>
@@ -173,6 +176,44 @@ rte_rand_max(uint64_t upper_bound)
return res;
}
+double
+rte_drand(void)
+{
+ struct rte_rand_state *state = __rte_rand_get_state();
+ uint64_t rand64 = __rte_rand_lfsr258(state);
+#ifdef RTE_LIBEAL_USE_IEEE754
+ union ieee754_double u = {
+ .ieee = {
+ .negative = 0,
+ .exponent = IEEE754_DOUBLE_BIAS,
+ },
+ };
+
+ /* Take 64 bit random value and put it into the mantissa
+ * This uses direct access to IEEE format to avoid doing
+ * any direct floating point math here.
+ */
+ u.ieee.mantissa0 = rand64 >> 32;
+ u.ieee.mantissa1 = rand64;
+
+ return u.d - 1.0;
+#else
+ /* Slower method requiring floating point divide
+ *
+ * The double mantissa only has 53 bits, so we uniformly mask off the
+ * high 11 bits and then floating-point divide by 2^53 to achieve a
+ * result in [0, 1).
+ *
+ * We are not allowed to emit 1.0, so denom must be one greater than
+ * the possible range of the preceding step.
+ */
+ static const uint64_t denom = (uint64_t)1 << 53;
+
+ rand64 &= denom - 1;
+ return (double)rand64 / denom;
+#endif
+}
+
static uint64_t
__rte_random_initial_seed(void)
{
@@ -65,6 +65,24 @@ rte_rand(void);
uint64_t
rte_rand_max(uint64_t upper_bound);
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Generates a pseudo-random floating point number.
+ *
+ * This function returns a nonnegative double-precision floating random
+ * number uniformly distributed over the interval [0.0, 1.0).
+ *
+ * The generator is not cryptographically secure.
+ * If called from lcore threads, this function is thread-safe.
+ *
+ * @return
+ * A pseudo-random value between 0 and 1.0.
+ */
+__rte_experimental
+double rte_drand(void);
+
#ifdef __cplusplus
}
#endif
@@ -32,3 +32,6 @@ endif
if cc.has_function('getentropy', prefix : '#include <unistd.h>')
cflags += '-DRTE_LIBEAL_USE_GETENTROPY'
endif
+if cc.has_header_symbol('ieee754.h', 'union ieee754_double')
+ cflags += '-DRTE_LIBEAL_USE_IEEE754'
+endif
@@ -422,6 +422,7 @@ EXPERIMENTAL {
rte_intr_type_set;
# added in 22.07
+ rte_drand;
rte_thread_get_affinity_by_id;
rte_thread_self;
rte_thread_set_affinity_by_id;