Now that we have everything in a C file, we can store the information
about our sleep, and have a native mechanism to wake up the sleeping
core. This mechanism would however only wake up a core that's sleeping
while monitoring - waking up from `rte_power_pause` won't work.
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
Notes:
v16:
- Improve error handling
- Take a lock before UMONITOR
v13:
- Add comments around wakeup code to explain what it does
- Add lcore_id parameter checking to prevent buffer overrun
lib/librte_eal/arm/rte_power_intrinsics.c | 9 ++
.../include/generic/rte_power_intrinsics.h | 16 +++
lib/librte_eal/ppc/rte_power_intrinsics.c | 9 ++
lib/librte_eal/version.map | 1 +
lib/librte_eal/x86/rte_power_intrinsics.c | 98 ++++++++++++++++++-
5 files changed, 132 insertions(+), 1 deletion(-)
> Now that we have everything in a C file, we can store the information
> about our sleep, and have a native mechanism to wake up the sleeping
> core. This mechanism would however only wake up a core that's sleeping
> while monitoring - waking up from `rte_power_pause` won't work.
>
> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
> ---
>
> Notes:
> v16:
> - Improve error handling
> - Take a lock before UMONITOR
>
> v13:
> - Add comments around wakeup code to explain what it does
> - Add lcore_id parameter checking to prevent buffer overrun
>
> lib/librte_eal/arm/rte_power_intrinsics.c | 9 ++
> .../include/generic/rte_power_intrinsics.h | 16 +++
> lib/librte_eal/ppc/rte_power_intrinsics.c | 9 ++
> lib/librte_eal/version.map | 1 +
> lib/librte_eal/x86/rte_power_intrinsics.c | 98 ++++++++++++++++++-
> 5 files changed, 132 insertions(+), 1 deletion(-)
>
> --
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> 2.25.1
@@ -27,3 +27,12 @@ rte_power_pause(const uint64_t tsc_timestamp)
return -ENOTSUP;
}
+
+/**
+ * This function is not supported on ARM.
+ */
+void
+rte_power_monitor_wakeup(const unsigned int lcore_id)
+{
+ RTE_SET_USED(lcore_id);
+}
@@ -62,6 +62,22 @@ __rte_experimental
int rte_power_monitor(const struct rte_power_monitor_cond *pmc,
const uint64_t tsc_timestamp);
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Wake up a specific lcore that is in a power optimized state and is monitoring
+ * an address.
+ *
+ * @note This function will *not* wake up a core that is in a power optimized
+ * state due to calling `rte_power_pause`.
+ *
+ * @param lcore_id
+ * Lcore ID of a sleeping thread.
+ */
+__rte_experimental
+int rte_power_monitor_wakeup(const unsigned int lcore_id);
+
/**
* @warning
* @b EXPERIMENTAL: this API may change without prior notice
@@ -27,3 +27,12 @@ rte_power_pause(const uint64_t tsc_timestamp)
return -ENOTSUP;
}
+
+/**
+ * This function is not supported on PPC64.
+ */
+void
+rte_power_monitor_wakeup(const unsigned int lcore_id)
+{
+ RTE_SET_USED(lcore_id);
+}
@@ -406,6 +406,7 @@ EXPERIMENTAL {
# added in 21.02
rte_power_monitor;
+ rte_power_monitor_wakeup;
rte_power_pause;
rte_thread_tls_key_create;
rte_thread_tls_key_delete;
@@ -2,8 +2,31 @@
* Copyright(c) 2020 Intel Corporation
*/
+#include <rte_common.h>
+#include <rte_lcore.h>
+#include <rte_spinlock.h>
+
#include "rte_power_intrinsics.h"
+/*
+ * Per-lcore structure holding current status of C0.2 sleeps.
+ */
+static struct power_wait_status {
+ rte_spinlock_t lock;
+ volatile void *monitor_addr; /**< NULL if not currently sleeping */
+} __rte_cache_aligned wait_status[RTE_MAX_LCORE];
+
+static inline void
+__umwait_wakeup(volatile void *addr)
+{
+ uint64_t val;
+
+ /* trigger a write but don't change the value */
+ val = __atomic_load_n((volatile uint64_t *)addr, __ATOMIC_RELAXED);
+ __atomic_compare_exchange_n((volatile uint64_t *)addr, &val, val, 0,
+ __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+}
+
static bool wait_supported;
static inline uint64_t
@@ -51,17 +74,29 @@ rte_power_monitor(const struct rte_power_monitor_cond *pmc,
{
const uint32_t tsc_l = (uint32_t)tsc_timestamp;
const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);
+ const unsigned int lcore_id = rte_lcore_id();
+ struct power_wait_status *s;
/* prevent user from running this instruction if it's not supported */
if (!wait_supported)
return -ENOTSUP;
+ /* prevent non-EAL thread from using this API */
+ if (lcore_id >= RTE_MAX_LCORE)
+ return -EINVAL;
+
if (pmc == NULL)
return -EINVAL;
if (__check_val_size(pmc->data_sz) < 0)
return -EINVAL;
+ s = &wait_status[lcore_id];
+
+ /* update sleep address */
+ rte_spinlock_lock(&s->lock);
+ s->monitor_addr = pmc->addr;
+
/*
* we're using raw byte codes for now as only the newest compiler
* versions support this instruction natively.
@@ -72,21 +107,37 @@ rte_power_monitor(const struct rte_power_monitor_cond *pmc,
:
: "D"(pmc->addr));
+ /* now that we've put this address into monitor, we can unlock */
+ rte_spinlock_unlock(&s->lock);
+
+ /* if we have a comparison mask, we might not need to sleep at all */
if (pmc->mask) {
const uint64_t cur_value = __get_umwait_val(
pmc->addr, pmc->data_sz);
const uint64_t masked = cur_value & pmc->mask;
/* if the masked value is already matching, abort */
- if (masked == pmc->val)
+ if (masked == pmc->val) {
+ /* erase sleep address */
+ rte_spinlock_lock(&s->lock);
+ s->monitor_addr = NULL;
+ rte_spinlock_unlock(&s->lock);
+
return 0;
+ }
}
+
/* execute UMWAIT */
asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf7;"
: /* ignore rflags */
: "D"(0), /* enter C0.2 */
"a"(tsc_l), "d"(tsc_h));
+ /* erase sleep address */
+ rte_spinlock_lock(&s->lock);
+ s->monitor_addr = NULL;
+ rte_spinlock_unlock(&s->lock);
+
return 0;
}
@@ -122,3 +173,48 @@ RTE_INIT(rte_power_intrinsics_init) {
if (i.power_monitor && i.power_pause)
wait_supported = 1;
}
+
+int
+rte_power_monitor_wakeup(const unsigned int lcore_id)
+{
+ struct power_wait_status *s;
+
+ /* prevent user from running this instruction if it's not supported */
+ if (!wait_supported)
+ return -ENOTSUP;
+
+ /* prevent buffer overrun */
+ if (lcore_id >= RTE_MAX_LCORE)
+ return -EINVAL;
+
+ s = &wait_status[lcore_id];
+
+ /*
+ * There is a race condition between sleep, wakeup and locking, but we
+ * don't need to handle it.
+ *
+ * Possible situations:
+ *
+ * 1. T1 locks, sets address, unlocks
+ * 2. T2 locks, triggers wakeup, unlocks
+ * 3. T1 sleeps
+ *
+ * In this case, because T1 has already set the address for monitoring,
+ * we will wake up immediately even if T2 triggers wakeup before T1
+ * goes to sleep.
+ *
+ * 1. T1 locks, sets address, unlocks, goes to sleep, and wakes up
+ * 2. T2 locks, triggers wakeup, and unlocks
+ * 3. T1 locks, erases address, and unlocks
+ *
+ * In this case, since we've already woken up, the "wakeup" was
+ * unneeded, and since T1 is still waiting on T2 releasing the lock, the
+ * wakeup address is still valid so it's perfectly safe to write it.
+ */
+ rte_spinlock_lock(&s->lock);
+ if (s->monitor_addr != NULL)
+ __umwait_wakeup(s->monitor_addr);
+ rte_spinlock_unlock(&s->lock);
+
+ return 0;
+}